Link to intro to dataset processing:

<https://nbviewer.jupyter.org/github/jennselby/MachineLearningCourseNotes/blob/master/assets/ipynb/IntroductiontoDatasetProcessing.ipyn>

Link to dataset:

<https://www.kaggle.com/datacrux/barack-obama-twitterdata-from-20122019>

# Setup

In [2]:
# imports

import sklearn
import pandas
from sklearn.preprocessing import StandardScaler # for standardization
from sklearn.feature_extraction.text import CountVectorizer # for counting words

In [3]:
# import data into dataframe

df = pandas.read_csv('./tweets-BarackObama.csv')
print(len(df.index), "rows")
df.head()

6851 rows


Unnamed: 0,Date,Username,Tweet-text,Tweet Link,Retweets,Likes,TweetImageUrl,Image
0,2019/04/10_17:08,https://twitter.com/BarackObama,From a big NBA fan congrats to future Hall of ...,https://twitter.com/BarackObama/status/1116130...,20954,175133,,
1,2019/04/06_06:16,https://twitter.com/BarackObama,In just a few minutes I’m taking the stage at ...,https://twitter.com/BarackObama/status/1114517...,5206,36179,,
2,2019/04/05_09:25,https://twitter.com/BarackObama,A voice everybody should hear.https://twitter....,https://twitter.com/BarackObama/status/1114202...,27938,134604,,
3,2019/04/04_12:23,https://twitter.com/BarackObama,Here’s a story about people doing good that's ...,https://twitter.com/BarackObama/status/1113884...,15396,78627,,
4,2019/04/03_12:07,https://twitter.com/BarackObama,Great to see Chicago’s historic mayoral race b...,https://twitter.com/BarackObama/status/1113518...,8991,98094,,


# Start cleaning stuff up

In [4]:
# Seperate date column into date and time
# We could go even further and seperate date into year, month, and day and time into hours and minutes, but this is good enough for now

dates = []
times = []

for index, row, in df.iterrows():
    
    s = row["Date"].split("_")
    
    date = s[0]
    time = s[1]
    
    dates.append(date)
    times.append(time)
    
# I could also do something like this

# df_temp = df['Date'].str.split('_', expand=True)
# df = pandas.concat([df, df_temp], axis=1, sort=False)

In [5]:
# Delete original "Date" column, add in new "Date" and "Time" columns

df = df.drop("Date", axis=1)
df.insert(0, "Date", dates) # insert column at beginning
df.insert(1, "Time", times) # insert column at beginning
df.head()

Unnamed: 0,Date,Time,Username,Tweet-text,Tweet Link,Retweets,Likes,TweetImageUrl,Image
0,2019/04/10,17:08,https://twitter.com/BarackObama,From a big NBA fan congrats to future Hall of ...,https://twitter.com/BarackObama/status/1116130...,20954,175133,,
1,2019/04/06,06:16,https://twitter.com/BarackObama,In just a few minutes I’m taking the stage at ...,https://twitter.com/BarackObama/status/1114517...,5206,36179,,
2,2019/04/05,09:25,https://twitter.com/BarackObama,A voice everybody should hear.https://twitter....,https://twitter.com/BarackObama/status/1114202...,27938,134604,,
3,2019/04/04,12:23,https://twitter.com/BarackObama,Here’s a story about people doing good that's ...,https://twitter.com/BarackObama/status/1113884...,15396,78627,,
4,2019/04/03,12:07,https://twitter.com/BarackObama,Great to see Chicago’s historic mayoral race b...,https://twitter.com/BarackObama/status/1113518...,8991,98094,,


In [6]:
# Remove colons from the time column (we could also do this to the date column but I'm just not doing that here)
# There is probably a way to do this with regex and something like df.replace("\d\d(:)\d\d", "") but I'm not sure how

new_times = []

for index, row, in df.iterrows():
    
    time = row["Time"]
    time = time.replace(":", "")
    
    new_times.append(time)
    
df = df.drop("Time", axis=1)
df.insert(1, "Time", new_times)
df.head()

Unnamed: 0,Date,Time,Username,Tweet-text,Tweet Link,Retweets,Likes,TweetImageUrl,Image
0,2019/04/10,1708,https://twitter.com/BarackObama,From a big NBA fan congrats to future Hall of ...,https://twitter.com/BarackObama/status/1116130...,20954,175133,,
1,2019/04/06,616,https://twitter.com/BarackObama,In just a few minutes I’m taking the stage at ...,https://twitter.com/BarackObama/status/1114517...,5206,36179,,
2,2019/04/05,925,https://twitter.com/BarackObama,A voice everybody should hear.https://twitter....,https://twitter.com/BarackObama/status/1114202...,27938,134604,,
3,2019/04/04,1223,https://twitter.com/BarackObama,Here’s a story about people doing good that's ...,https://twitter.com/BarackObama/status/1113884...,15396,78627,,
4,2019/04/03,1207,https://twitter.com/BarackObama,Great to see Chicago’s historic mayoral race b...,https://twitter.com/BarackObama/status/1113518...,8991,98094,,


In [7]:
# Delete "Image" column because there are no image thumbnails (all the values are NaN)

df = df.drop("Image", axis=1)
df.head()

Unnamed: 0,Date,Time,Username,Tweet-text,Tweet Link,Retweets,Likes,TweetImageUrl
0,2019/04/10,1708,https://twitter.com/BarackObama,From a big NBA fan congrats to future Hall of ...,https://twitter.com/BarackObama/status/1116130...,20954,175133,
1,2019/04/06,616,https://twitter.com/BarackObama,In just a few minutes I’m taking the stage at ...,https://twitter.com/BarackObama/status/1114517...,5206,36179,
2,2019/04/05,925,https://twitter.com/BarackObama,A voice everybody should hear.https://twitter....,https://twitter.com/BarackObama/status/1114202...,27938,134604,
3,2019/04/04,1223,https://twitter.com/BarackObama,Here’s a story about people doing good that's ...,https://twitter.com/BarackObama/status/1113884...,15396,78627,
4,2019/04/03,1207,https://twitter.com/BarackObama,Great to see Chicago’s historic mayoral race b...,https://twitter.com/BarackObama/status/1113518...,8991,98094,


# Dealing with the image urls

In [8]:
# We could just delete this column if we have no use for any images in any tweets

df_1 = df.drop("TweetImageUrl", axis=1)
df_1.head()

Unnamed: 0,Date,Time,Username,Tweet-text,Tweet Link,Retweets,Likes
0,2019/04/10,1708,https://twitter.com/BarackObama,From a big NBA fan congrats to future Hall of ...,https://twitter.com/BarackObama/status/1116130...,20954,175133
1,2019/04/06,616,https://twitter.com/BarackObama,In just a few minutes I’m taking the stage at ...,https://twitter.com/BarackObama/status/1114517...,5206,36179
2,2019/04/05,925,https://twitter.com/BarackObama,A voice everybody should hear.https://twitter....,https://twitter.com/BarackObama/status/1114202...,27938,134604
3,2019/04/04,1223,https://twitter.com/BarackObama,Here’s a story about people doing good that's ...,https://twitter.com/BarackObama/status/1113884...,15396,78627
4,2019/04/03,1207,https://twitter.com/BarackObama,Great to see Chicago’s historic mayoral race b...,https://twitter.com/BarackObama/status/1113518...,8991,98094


In [9]:
# Or if we are running analysis on the tweets with images (and we can't use tweets without images), we can delete every row without an image url

df_2 = df.dropna()
df_2.head()

Unnamed: 0,Date,Time,Username,Tweet-text,Tweet Link,Retweets,Likes,TweetImageUrl
5,2019/04/02,608,https://twitter.com/BarackObama,Valerie is one of my oldest friends and adviso...,https://twitter.com/BarackObama/status/1113065...,6839,66187,https://pbs.twimg.com/media/D3JnZdBVYAEVq8U.jpg
6,2019/03/26,1133,https://twitter.com/BarackObama,Last night I had the chance to meet with first...,https://twitter.com/BarackObama/status/1110610...,14325,117989,https://pbs.twimg.com/media/D2musJoWsAIFVwg.jpg
7,2019/03/21,843,https://twitter.com/BarackObama,Just in the nick of time: My brackets have nev...,https://twitter.com/BarackObama/status/1108756...,8457,59574,https://pbs.twimg.com/media/D2MXyrbWsAAdVSz.jpg
10,2019/03/17,656,https://twitter.com/BarackObama,In 2011 I visited the tiny town of Moneygall a...,https://twitter.com/BarackObama/status/1107279...,23726,244904,https://pbs.twimg.com/media/D13Y3rOWsAA42ma.jpg
15,2019/03/08,803,https://twitter.com/BarackObama,The Crew Dragon’s been on quite a ride since I...,https://twitter.com/BarackObama/status/1104049...,10923,130875,https://pbs.twimg.com/media/D1JfmpgXcAUJo_R.jpg


# Bag of words the tweets (for final project)

In [22]:
bow = []

for index, row in df_2.iterrows():
    
    tweet = row["Tweet-text"]
    
    words_list = tweet.split()
    words_dict = {}
    
    for w in words_list:
        if w not in words_dict:
            words_dict[w] = 1
        else:
            words_dict[w] += 1

    bow.append(words_dict)
    
df_2["bow"] = bow
df_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2["bow"] = bow


Unnamed: 0,Date,Time,Username,Tweet-text,Tweet Link,Retweets,Likes,TweetImageUrl,bow
5,2019/04/02,608,https://twitter.com/BarackObama,Valerie is one of my oldest friends and adviso...,https://twitter.com/BarackObama/status/1113065...,6839,66187,https://pbs.twimg.com/media/D3JnZdBVYAEVq8U.jpg,"{'Valerie': 1, 'is': 1, 'one': 1, 'of': 2, 'my..."
6,2019/03/26,1133,https://twitter.com/BarackObama,Last night I had the chance to meet with first...,https://twitter.com/BarackObama/status/1110610...,14325,117989,https://pbs.twimg.com/media/D2musJoWsAIFVwg.jpg,"{'Last': 1, 'night': 1, 'I': 1, 'had': 1, 'the..."
7,2019/03/21,843,https://twitter.com/BarackObama,Just in the nick of time: My brackets have nev...,https://twitter.com/BarackObama/status/1108756...,8457,59574,https://pbs.twimg.com/media/D2MXyrbWsAAdVSz.jpg,"{'Just': 1, 'in': 1, 'the': 1, 'nick': 1, 'of'..."
10,2019/03/17,656,https://twitter.com/BarackObama,In 2011 I visited the tiny town of Moneygall a...,https://twitter.com/BarackObama/status/1107279...,23726,244904,https://pbs.twimg.com/media/D13Y3rOWsAA42ma.jpg,"{'In': 1, '2011': 1, 'I': 1, 'visited': 1, 'th..."
15,2019/03/08,803,https://twitter.com/BarackObama,The Crew Dragon’s been on quite a ride since I...,https://twitter.com/BarackObama/status/1104049...,10923,130875,https://pbs.twimg.com/media/D1JfmpgXcAUJo_R.jpg,"{'The': 1, 'Crew': 1, 'Dragon’s': 1, 'been': 1..."


# Standardization

In [205]:
# Now if we want to standardize the retweets and likes values...

scaler = StandardScaler()
df_s = pandas.DataFrame(data=scaler.fit_transform(df[["Retweets", "Likes"]]), columns=["Retweets", "Likes"])
df_s.head()

Unnamed: 0,Retweets,Likes
0,0.52462,1.408344
1,0.032596,0.193133
2,0.742825,1.053901
3,0.350968,0.564358
4,0.150853,0.734605


In [206]:
# We can add these standardized values back into the original dataframe and rename the original columns to keep things organized

df = df.rename(columns={"Retweets": "Retweets (og)", "Likes": "Likes (og)"})

df.insert(7, "Retweets (s)", df_s[["Retweets"]])
df.insert(8, "Likes (s)", df_s[["Likes"]])

df.head()

Unnamed: 0,Date,Time,Username,Tweet-text,Tweet Link,Retweets (og),Likes (og),Retweets (s),Likes (s),TweetImageUrl
0,2019/04/10,1708,https://twitter.com/BarackObama,From a big NBA fan congrats to future Hall of ...,https://twitter.com/BarackObama/status/1116130...,20954,175133,0.52462,1.408344,
1,2019/04/06,616,https://twitter.com/BarackObama,In just a few minutes I’m taking the stage at ...,https://twitter.com/BarackObama/status/1114517...,5206,36179,0.032596,0.193133,
2,2019/04/05,925,https://twitter.com/BarackObama,A voice everybody should hear.https://twitter....,https://twitter.com/BarackObama/status/1114202...,27938,134604,0.742825,1.053901,
3,2019/04/04,1223,https://twitter.com/BarackObama,Here’s a story about people doing good that's ...,https://twitter.com/BarackObama/status/1113884...,15396,78627,0.350968,0.564358,
4,2019/04/03,1207,https://twitter.com/BarackObama,Great to see Chicago’s historic mayoral race b...,https://twitter.com/BarackObama/status/1113518...,8991,98094,0.150853,0.734605,


In [207]:
# Now that we're done cleaning things up, lets find all of Obama's tweets which mention the nba!

condition = df['Tweet-text'].str.contains('NBA')
filtered_data = df[condition]
filtered_data.head()

Unnamed: 0,Date,Time,Username,Tweet-text,Tweet Link,Retweets (og),Likes (og),Retweets (s),Likes (s),TweetImageUrl
0,2019/04/10,1708,https://twitter.com/BarackObama,From a big NBA fan congrats to future Hall of ...,https://twitter.com/BarackObama/status/1116130...,20954,175133,0.52462,1.408344,
24,2019/02/16,1240,https://twitter.com/BarackObama,I’ve always loved basketball because it’s abou...,https://twitter.com/BarackObama/status/1096871...,23270,160902,0.59698,1.283888,
5709,2013/06/20,1611,https://twitter.com/BarackObama,Nothing better than a Game 7 for all the marbl...,https://twitter.com/BarackObama/status/3478543...,5565,1913,0.043812,-0.106538,
5747,2013/06/14,1332,https://twitter.com/BarackObama,If you're from Indiana you'd better have a goo...,https://twitter.com/BarackObama/status/3456398...,622,372,-0.110625,-0.120015,
5753,2013/06/13,1712,https://twitter.com/BarackObama,Don't miss @selcharrodeoro's encore performanc...,https://twitter.com/BarackObama/status/3453328...,2543,1757,-0.050606,-0.107902,


In [208]:
# We can also find his most commonly used word

# Create a string made up of every single tweet
all_tweets = ""
for index, row, in df.iterrows():  
    all_tweets = all_tweets + " " + row["Tweet-text"]
        
count_vect = CountVectorizer() 
counts = count_vect.fit_transform([all_tweets])
# word_counts_df = pd.DataFrame(data=counts.todense())
word_nums = count_vect.vocabulary_
print(max(word_nums, key=word_nums.get))

zyyrca


Hmmm, that doesn't seem to be working quite properly... (although when I print out word_nums and eyeball things "zyyrca" does seem to have the largest value)

In [None]:
for label, content in df_new.items():
    content = content.tolist()
    if content[0] == "Positive" or content[0] == "Negative":
        print(label)