In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import preprocessor as p
# English stop words from the NLTK corpus; clean_text() filters tokens against this list.
# NOTE: the NLTK 'stopwords' corpus must already be available locally
# (e.g. via nltk.download('stopwords')).
stopwords = nltk.corpus.stopwords.words('english')
# Let pandas show up to 100 characters per cell so tweet text is not cut off too early.
pd.set_option('display.max_colwidth', 100)

In [2]:
# Load the raw tweets from 'tweets.csv' (path relative to the working directory)
# and preview the first rows.
df = pd.read_csv('tweets.csv')
df.head()

Unnamed: 0,author,content,country,date_time,id,language,latitude,longitude,number_of_likes,number_of_shares
0,katyperry,Is history repeating itself...?#DONTNORMALIZEHATE https://t.co/ngG11quhmK,,12/01/2017 19:52,8.19633e+17,en,,,7900,3472
1,katyperry,@barackobama Thank you for your incredible grace in leadership and for being an exceptional… htt...,,11/01/2017 08:38,8.19101e+17,en,,,3689,1380
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,,11/01/2017 02:52,8.19014e+17,en,,,10341,2387
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,,11/01/2017 02:44,8.19012e+17,en,,,10774,2458
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ https://t.co/0shuUYUBEv,,10/01/2017 05:22,8.18689e+17,en,,,17620,4655


In [3]:
# DataFrame.shape is (rows, columns): shape[0] counts rows, shape[1] counts columns.
print(f'There are {df.shape[0]} rows and {df.shape[1]} columns')

There are 52542 columns and 10 rows


In [4]:
# Summarise dtypes and non-null counts per column to spot sparsely populated columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52542 entries, 0 to 52541
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   author            52542 non-null  object 
 1   content           52542 non-null  object 
 2   country           36 non-null     object 
 3   date_time         52542 non-null  object 
 4   id                52542 non-null  float64
 5   language          52542 non-null  object 
 6   latitude          1 non-null      float64
 7   longitude         1 non-null      float64
 8   number_of_likes   52542 non-null  int64  
 9   number_of_shares  52542 non-null  int64  
dtypes: float64(3), int64(2), object(5)
memory usage: 4.0+ MB


Let's drop the columns that will not be useful to the machine learning models.

#### Create Function to Clean Text

- Removing Punctuation
- Tokenizing
- Removing Stopwords

In [5]:
# Drop the columns that are not needed downstream and give the rest short names.
# Renaming through an explicit mapping (rather than assigning df.columns
# positionally) keeps every label attached to the right data even if the
# column order of the CSV changes.
df = (
    df
    .drop(columns=['country', 'date_time', 'id', 'language',
                   'latitude', 'longitude'])
    .rename(columns={
        'content': 'text',
        'number_of_likes': 'likes',
        'number_of_shares': 'shares',
    })
)
df.head()

Unnamed: 0,author,text,likes,shares
0,katyperry,Is history repeating itself...?#DONTNORMALIZEHATE https://t.co/ngG11quhmK,7900,3472
1,katyperry,@barackobama Thank you for your incredible grace in leadership and for being an exceptional… htt...,3689,1380
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,10341,2387
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,10774,2458
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ https://t.co/0shuUYUBEv,17620,4655


In [6]:
# The characters that count as punctuation; clean_text() deletes each of them from a tweet.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
# Sample tweets covering the three artefacts to strip later: a URL, a mention and a hashtag.
sentences = ["Check out this cool website: https://example.com",
             "Hey @username, nice to meet you!",
             "Using #Python for data analysis is awesome!"]

# Naive tokenisation: split on runs of non-word characters. This breaks the URL
# into pieces and leaves a trailing '' when a sentence ends in punctuation.
for sentence in sentences:
    # Raw string so that \W reaches the regex engine instead of being treated
    # as an (invalid) string escape sequence.
    tokens = re.split(r'\W+', sentence)
    print(tokens)

['Check', 'out', 'this', 'cool', 'website', 'https', 'example', 'com']
['Hey', 'username', 'nice', 'to', 'meet', 'you', '']
['Using', 'Python', 'for', 'data', 'analysis', 'is', 'awesome', '']


In [9]:
# Tell the tweet preprocessor which artefacts to strip: URLs, @mentions and #hashtags
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG)
# Run each sample sentence through the preprocessor, then drop punctuation
# characters one by one and lowercase whatever is left
for sentence in sentences:
    cleaned_sentence = p.clean(sentence)
    text = ''.join(
        char.lower()
        for char in cleaned_sentence
        if char not in string.punctuation
    )
    print(text)

check out this cool website
hey  nice to meet you
using for data analysis is awesome


In [10]:
# Tell the tweet preprocessor which artefacts to strip: URLs, @mentions and #hashtags.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG)

# Snapshot of the stop-word list as a set: membership tests are constant time,
# whereas scanning the list for every token of every tweet is linear in its length.
_STOPWORD_SET = frozenset(stopwords)


def clean_text(text):
    """Turn a raw tweet into a list of lowercase tokens without stop words.

    Steps:
      1. strip URLs, @mentions and #hashtags with the tweet preprocessor;
      2. delete every character found in ``string.punctuation`` and lowercase
         the remaining characters;
      3. split the result on runs of non-word characters;
      4. discard empty tokens and tokens present in the stop-word list.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Surviving tokens in their original order. The list is empty when
        nothing survives the cleaning, or when ``text`` is not a string
        (for example NaN coming from a missing CSV cell).
    """
    # Non-string values cannot be cleaned; treat them as having no tokens.
    if not isinstance(text, str):
        return []

    cleaned_text = p.clean(text)
    # Punctuation is deleted (not replaced by a space), so "doin'" becomes "doin".
    no_punct = ''.join(
        char.lower() for char in cleaned_text if char not in string.punctuation
    )
    # Raw string so that \W reaches the regex engine instead of being an invalid
    # string escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' at the edges when the text starts or ends with a
    # non-word character; the truthiness test removes those.
    return [token for token in tokens if token and token not in _STOPWORD_SET]

In [11]:
# Tokenise every tweet. clean_text already takes a single value, so it is
# passed to apply directly instead of being wrapped in a lambda.
df['clean_text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,author,text,likes,shares,clean_text
0,katyperry,Is history repeating itself...?#DONTNORMALIZEHATE https://t.co/ngG11quhmK,7900,3472,"[history, repeating]"
1,katyperry,@barackobama Thank you for your incredible grace in leadership and for being an exceptional… htt...,3689,1380,"[thank, incredible, grace, leadership, exceptional]"
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,10341,2387,"[life, goals]"
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,10774,2458,[right]
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ https://t.co/0shuUYUBEv,17620,4655,"[sisters, doin]"


Let's find out if there are empty arrays after the preprocessing

In [16]:
# Rows whose token list came back empty from the cleaning step
df.loc[df['clean_text'].map(len) == 0]

Unnamed: 0,author,text,likes,shares,clean_text
23,katyperry,Same https://t.co/98vBUp7BzI,4236,1143,[]
59,katyperry,🔮 https://t.co/aFzxsIRfhn,9348,2395,[]
66,katyperry,🎣🐟🐠🐋,10197,4432,[]
151,katyperry,🎣,7885,3039,[]
168,katyperry,#KarlBecker,4786,1308,[]
...,...,...,...,...,...
52451,ddlovato,❤️🌴☀️💄💋 http://t.co/64FlAWsv2s,15489,10967,[]
52485,ddlovato,#nudezz http://t.co/Q0H0RNjQUf,14845,9875,[]
52504,ddlovato,#sketchers #buttsweat 💪👍💦 http://t.co/YEFHSB16p2,14292,9012,[]
52540,ddlovato,❄️ http://t.co/sHCFdPpGPa,15985,10456,[]


Let's remove the 2,792 rows with empty arrays, as they won't be suitable for our model.

In [19]:
# Keep only the rows that still have at least one token. A boolean mask does
# this in one step (no index lookup, no inplace=True); the original index
# labels are preserved. .copy() makes the result an independent frame so that
# later column assignments do not trigger SettingWithCopyWarning.
df = df[df['clean_text'].apply(len) > 0].copy()

In [23]:
# Sanity check: this selection should now come back with no rows
df[df['clean_text'].map(len).eq(0)]

Unnamed: 0,author,text,likes,shares,clean_text
