## Text cleaning and Preprocessing

In [1]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import spacy

In [3]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitter4000.csv',encoding= 'latin-1')

In [5]:
df

Unnamed: 0,twitts,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0
...,...,...
3995,i just graduated,1
3996,Templating works; it all has to be done,1
3997,mommy just brought me starbucks,1
3998,@omarepps watching you on a House re-run...lov...,1


In [6]:
df['sentiment'].value_counts()

sentiment
0    2000
1    2000
Name: count, dtype: int64

### Word Counts

In [7]:
df['word_counts'] = df['twitts'].apply(lambda x: len(str(x).split()))

In [8]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts
3722,"This is gonna be our last race, need to eat so...",1,12
1355,@LiesThatBlind Uh-Oh.,0,2
2934,she calls me uthie . and i call her ujhie . sh...,1,26
2716,I'm so in love with Anthony Green's voice. Go ...,1,18
781,awww. i passed my 100th tweet and I didn't no...,0,10


In [9]:
df['word_counts'].max()

32

In [10]:
df['word_counts'].min()

1

In [11]:
df['word_counts'] == 1

0       False
1       False
2       False
3       False
4       False
        ...  
3995    False
3996    False
3997    False
3998    False
3999    False
Name: word_counts, Length: 4000, dtype: bool

In [12]:
df[df['word_counts']==1]

Unnamed: 0,twitts,sentiment,word_counts
385,homework,0,1
691,@ekrelly,0,1
1124,disappointed,0,1
1286,@officialmgnfox,0,1
1325,headache,0,1
1897,@MCRmuffin,0,1
2542,Graduated!,1,1
2947,reading,1,1
3176,@omeirdeleon,1,1
3470,www.myspace.com/myfinalthought,1,1


### Character Counts

In [13]:
def char_counts(x):
    s = x.split()
    x = ''.join(s)
    return len(x)

In [14]:
char_counts('I am wide awake')

12

In [15]:
df['char_counts'] = df['twitts'].apply(lambda x: char_counts(str(x)))

In [16]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts
3977,Five minutes.....and counting....,1,3,31
3058,Enjoying a cold beer after a hard days work,1,9,35
789,"49 more days. oh, and Happy Fathers Day i...",0,17,71
1537,@adamarmsup &lt;3 you Adam.,0,4,24
3304,@kevinwweaver he could seriously learn a thing...,1,12,60


### Average Word Length

In [17]:
df['avg_word_len'] = df['char_counts']/df['word_counts']

In [18]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len
697,Bedtime. I've got a shitload of laundry to do ...,0,10,45,4.5
19,@KimberleyMtkg Hey there! No can do! Travelin...,0,18,81,4.5
1884,so much homework.,0,3,15,5.0
2258,Sitting in my couch and looking out the window...,1,22,80,3.636364
3973,@Navinesh still waiting for mine I may need ...,1,14,53,3.785714


### Stop Words Count

In [19]:
print(stopwords)

{'may', 'that', 'various', 'beforehand', 'sixty', '’ll', 'eleven', 'elsewhere', 'again', 'further', 'throughout', 'neither', 're', 'ten', 'twelve', 'some', "'ll", 'bottom', 'become', 'about', 'among', 'because', 'nine', 'over', 'last', 'quite', 'if', 'take', 'call', 'less', 'this', 'meanwhile', 'whom', 'across', 'since', 'wherein', 'beside', 'is', 'onto', 'any', 'give', 'i', 'through', 'wherever', 'what', '‘s', 'his', "'ve", 'too', 'always', 'cannot', 'after', 'enough', 'front', 'anything', "'d", 'anyhow', 'yourselves', 'put', 'using', 'around', 'itself', 'mine', 'via', 'thereafter', 'beyond', 'six', 'please', 'therein', 'back', 'yours', 'have', "'s", 'per', 'every', 'above', '’re', 'third', 'did', 'so', "'m", 'often', 'thru', 'very', 'your', 'both', 'myself', 'ours', 'many', 'somewhere', 'sometimes', 'few', 'no', 'moreover', 'indeed', 'none', 'noone', 'those', 'anyway', 'doing', 'most', 'never', 'sometime', 'can', 'nothing', 'could', 'latterly', 'whatever', 'has', 'thereby', 'me', 'fi

In [20]:
len(stopwords)

326

In [21]:
df['stop_words_len'] = df['twitts'].apply(lambda x: len([t for t in x.split() if t in stopwords]))

### #Hashtags and @Mentions Count

In [22]:
# [t for t in x.split() if t.starstwith('#')]

In [23]:
df['hashtags_count'] = df['twitts'].apply(lambda x: len([t for t in x.split() if t.startswith('#')]))

In [24]:
df['mentions_count'] = df['twitts'].apply(lambda x: len([t for t in x.split() if t.startswith('@')]))

In [25]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count
186,catching up on some tv. i miss dvr,0,8,27,3.375,4,0,0
2773,"@tehkseven @Cali_Breezy, Yea. I can confirm th...",1,19,97,5.105263,6,0,2
534,cleaning the house! doesn't that sound like su...,0,9,49,5.444444,2,0,0
3026,Heading back to the R.C. tomorrow. Great vaca...,1,8,43,5.375,3,0,0
1417,Several beers turned into a lot of beers and n...,0,14,54,3.857143,5,0,0


### Numeric Digits Count

In [26]:
x = 'this is 1 and 2'
x.split()

['this', 'is', '1', 'and', '2']

In [27]:
x.split()[4].isdigit()

True

In [28]:
[t for t in x.split() if t.isdigit()]

['1', '2']

In [29]:
df['numeric_count'] = df['twitts'].apply(lambda x : len([t for t in x.split() if t.isdigit()]))

In [30]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count
739,I also could really use a nap.,0,7,24,3.428571,4,0,0,0
366,"Yep, its official......I dont feel well.",0,6,35,5.833333,1,0,0,0
35,"I was rollin' up Prince Ave, heard all the sir...",0,17,69,4.058824,7,0,0,0
1482,@sablevenus Lol. I don't even bother. Sad times.,0,8,41,5.125,1,0,1,0
3594,"@pdurham uh oh, now you have to be nice.",1,9,32,3.555556,5,0,1,0


### Upper Case Words Count

In [31]:
x = "I GOT THE JOB"
y = "I got the job"

In [32]:
[t for t in x.split() if t.isupper()]

['I', 'GOT', 'THE', 'JOB']

In [33]:
[t for t in y.split() if t.isupper()]

['I']

In [34]:
df['upper_count'] = df['twitts'].apply(lambda x: len([t for t in x.split() if t.isupper()]))

In [35]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_count
1694,Boarded my flight but forgot my QC3s,0,7,30,4.285714,3,0,0,0,0
1446,i'm missing my wife at the MTV Movie Awards!! ...,0,19,80,4.210526,7,0,0,0,1
3020,@Jonasbrothers prob have to be 'fly with me' i...,1,10,48,4.8,4,0,1,0,0
1001,"Stayed in the garden all day and didnt tan, Bu...",0,26,98,3.769231,12,0,0,0,0
2119,@MetsMerized it was kind of funny late last ni...,1,20,93,4.65,8,0,1,0,0


In [36]:
df.iloc[483]['twitts']

"Hear it's raining in SF  Oh well, we will probably go the the Eagle for beer bust anyway."

### Lower Case Conversion

In [37]:
df['twitts'] = df['twitts'].apply(lambda x: str(x).lower())

In [38]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_count
998,@respectmileyc i stil want to! but have u hear...,0,25,106,4.24,10,0,1,0,1
3259,oops i mean the software update comes on wedne...,1,24,108,4.5,7,0,0,0,2
659,headed to the hospital for surgery. which mean...,0,14,66,4.714286,4,0,0,0,0
2664,@jesse236 yeah...just got it. today...,1,5,34,6.8,0,0,1,0,0
2541,i'm off to pick up a package from the post off...,1,23,82,3.565217,10,0,0,0,2


## Contraction to Expansion

In [39]:
# x = "don't shouldn't, i'll "  # do not should not i will

In [None]:
pip install contractions

In [41]:
import contractions

In [42]:
custom_contractions = {
"a’ight": "alright",
"ain’t" : "i am not",
"amn’t": "am not",
"arencha" : "are not you",
"aren’t": "are not",
"’bout": "about",
"boy's": "boy has",
"can’t": "cannot",
"cap’n": "captain",
"’cause" : "because",
"cuz": "because",
"’cept": "except",
"could’ve": "could have",
"couldn’t": "could not",
"couldn’t’ve" :	"could not have",
"cuppa" : "cup of",
"daren’t" : "dare not",
"daresn’t" : "dare not",
"dasn’t" : "dare not",
"didn’t": "did not",
"doesn't": "does not",
"don’t": "do not",
"dunno"	: "do not know",
"d’ye" : "do you",
"d’ya": "did you",
"e’en":	"even",
"e’er": "ever",
"’em": "them",
"everybody’s": "everybody is",
"everyone’s": "everyone is",
"everything's": "everything is",
"finna": "fixing to",
"fo’c’sle": "forecastle",
"’gainst" : "against",
"g’day":"good day",
"gimme": "give me",
"girl's": "girl is",
"giv’n": "given",
"gi’z": "give us",
"gonna":"going to",
"gon’t": "go not",
"gotta": "got to",
"guy's": "guy is",
"hadn’t" :	"had not",
"had’ve": "had have",
"hasn’t": "has not",
"haven’t":	"have not",
"he’d": "he had",
"he’d": "he would",
"he'll": "he will",
"helluva": "hell of a",
"yes'nt":" yes not",
"he’s ": "he is",
"here’s": "here is",
"how’d": "how did",
"howdy": "how do you do",
"how’ll": "how shall",
"how’re": "how are",
"how’s": "how is",
"i’d":	"I would",
"i’d’ve": "I would have",
"i’d’nt": "I would not",
"i’d’nt’ve": "I would not have",
"if’n": "If and when",
"i’ll": "I will",
"i’m": "I am",
"imma": "I am going to",
"i’mo": "I am going to",
"innit": "isn’t it",
"ion": "I do not",
"i’ve":	"I have",
"isn’t":" is not",
"it’d": "it would",
"it’ll": "it will",
"it’s": "it is",
"idunno": "I don’t know",
"kinda": "kind of",
"let’s": "let us",
"loven’t": "love not",
"ma’am": "madam",
"mayn’t": "may not",
"may’ve": "may have",
"methinks": "I think",
"mightn’t": "might not",
"might’ve": "might have",
"mine’s": "mine is",
"mustn’t": "must not",
"mustn’t’ve": "must not have",
"must’ve": "must have",
"’neath": "beneath",
"needn’t": "need not",
"nal": "and all",
"ne’er": "never",
"o’er":	"over",
"ol’": "old",
"ought’ve": "ought have",
"oughtn’t": "ought not",
"oughtn’t’ve": "ought not have",
"’round": "around",
"’s": "is",
"shan’t": "shall not",
"she’d": "she would",
"she’ll": "she will",
"she’s": "she is",
"should’ve": "should have",
"shouldn’t": "should not",
"shouldn’t’ve": "should not have",
"somebody’s" : "somebody has",
"someone’s": "someone is",
"something’s": "something is",
"so’re": "so are",
"so’s": "so is",
"so’ve": "so have",
"that’ll": "that will",
"that’re": "that are",
"that’s": "that is",
"that’d": "that would",
"there’d": "there would",
"there’ll": "there will",
"there’re": "there are",
"there’s": "there is",
"these’re": "these are",
"these’ve": "these have",
"they’d": "they would",
"they’d've": "they would have",
"they’ll": "they will",
"they’re": "they are",
"they’ve": "they have",
"this’s": "this is",
"those’re": "those are",
"those’ve": "those have",
"’thout": "without",
"’til": "until",
"’tis": "it is",
"to’ve": "to have",
"tryna": "trying to",
"’twas": "it was",
"’tween": "between",
"’twere": "it were",
"w’all": "we all",
"w’at": "we at",
"ur": "your",
"wanna": "want to",
"wasn’t": "was not",
"we’d": "we would",
"we’d’ve": "we would have",
"we’ll": "we will",
"we’re": "we are",
"we’ve": "we have",
"weren’t": "were not",
"whatcha": 	"what are you",
"what’d": "what did",
"what’ll": "what will",
"what’re": 	"what are",
"what’s": "what is",
"what’ve": "what have",
"when’s": "when is",
"where’d": "where did",
"where’ll": "where will",
"where’re": "where are",
"where’s": 	"where is",
"where’ve": "where have",
"which’d": "which would",
"which’ll": "which will",
"which’re": "which are",
"which’s": 	"which is",
"which’ve": "which have",
"who’d": "who would",
"who’d’ve": "who would have",
"who’ll": "who will",
"who’re": "who are",
"who’s": "who is",
"who’ve": "who have",
"why’d": "why did",
"why’re": "why are",
"why’s": "why is",
"willn’t": 	"will not",
"won’t": "will not",
"wonnot": "will not",
"would’ve": "would have",
"wouldn’t": "would not",
"wouldn’t’ve": "would not have",
"y’ain’t ":	"you are not",
"y’all": "you all",
"y’all’d’ve": "you all would have",
"y’all’dn't’ve":"you all would not have",
"y’all’re": "you all are",
"y’all’ren’t": 	"you all are not",
"y’at": "you at",
"yes’m": "yes madam",
"yever": "have you ever?",
"y’know": "you know",
"yessir": "yes sir",
"you’d": "you would",
"you’ll": "you will",
"you’re": "you are",
"you’ve": "you have",
"when’d": "when did"
}

In [51]:

import contractions

x = "y'all know I wouldn't forget ur birthday y'know"

def cont_to_exp(x):
    return contractions.fix(x)

result = cont_to_exp(x)
print(result)

you all know I would not forget you are birthday y'know


In [52]:
%%timeit
df['twitts'] = df['twitts'].apply(lambda x: cont_to_exp(x))

28.6 ms ± 1.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [53]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_count
3415,"@letshearitforme hehe. 2 hours as well, i love...",1,19,79,4.157895,5,0,1,1,0
680,damarques johnson vs. james wilkes showed hear...,0,20,106,5.3,5,0,0,0,2
514,tried 2 make a lasting impression....but i do ...,0,12,55,4.583333,5,0,0,1,0
2643,were here!!!! well in adrian atleast!,1,6,32,5.333333,2,0,0,0,0
2710,playing dumb really does have its advantages ....,1,13,65,5.0,6,0,0,0,1


### Count and Remove Emails

In [56]:
df[df['twitts'].str.contains('hotmail.com')]

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_count
3713,@securerecs arghh me please markbradbury_16@h...,1,5,51,10.2,0,0,1,0,0


In [57]:
df.iloc[3713]['twitts']

'@securerecs arghh me please  markbradbury_16@hotmail.com'

In [58]:
import re

In [61]:
x = '@securerecs arghh me please  markbradbury_16@hotmail.com'

In [62]:
re.findall(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', x)

['markbradbury_16@hotmail.com']

In [63]:
df['emails'] = df['twitts'].apply(lambda x: re.findall(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', x))

In [65]:
df['emails_count'] = df['emails'].apply(lambda x: len(x))

In [66]:
df[df['emails_count']>0]

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_count,emails,email_counts,emails_count
3713,@securerecs arghh me please markbradbury_16@h...,1,5,51,10.2,0,0,1,0,0,[markbradbury_16@hotmail.com],1,1


Remove the emails

In [67]:
re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)',"", x)

'@securerecs arghh me please  '

In [68]:
df['twitts'] = df['twitts'].apply(lambda x : re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)',"", x))

### Count and Remove URLS

In [69]:
x = 'hi, thanks for watching. for more videos, visit https://youtube.com/xaimli or github.com/xxy'

In [70]:
re.findall(r'(http|https|ftp|ssh|)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', x)

[('https', 'youtube.com', '/xaimli')]

In [71]:
df['url_flags'] = df['twitts'].apply(lambda x: len(re.findall(r'(http|https|ftp|ssh|)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', x)))

In [73]:
df[df['url_flags']>0].sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_count,emails,email_counts,emails_count,url_flags
3805,pic 11 is gerri halliwell is not it! did not k...,1,13,69,5.307692,3,0,0,1,1,[],0,0,1
3634,thinking of fuzzball more than air france htt...,1,8,60,7.5,3,0,0,0,0,[],0,0,1
3203,"@thewebguy http://twitpic.com/6jb33 - dude, th...",1,14,85,6.071429,3,0,1,0,0,[],0,0,1
1126,http://twitpic.com/7xdb1 - time to make the br...,0,11,53,4.818182,4,0,0,0,0,[],0,0,1
3313,new blog entry at http://bit.ly/a2jhg - talkin...,1,23,112,4.869565,7,0,0,0,0,[],0,0,1


In [74]:
re.sub(r'(http|https|ftp|ssh|)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?','', x)

'hi, thanks for watching. for more videos, visit  or github.com/xxy'

In [75]:
df['twitts'] = df['twitts'].apply(lambda x: re.sub(r'(http|https|ftp|ssh|)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?','', x))

In [76]:
df.sample(10)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_count,emails,email_counts,emails_count,url_flags
1582,@maryrose_m i have seen a few turtle on the ro...,0,18,83,4.611111,8,0,1,0,1,[],0,0,0
401,morning. my throat is still sore; definitely g...,0,22,113,5.136364,6,0,0,0,0,[],0,0,0
2052,@ondasonora lol. i have a catholic church from...,1,17,72,4.235294,6,0,1,0,3,[],0,0,0
1932,@dyankd whatsup sweet heart? gile anen nih aku,0,8,39,4.875,0,0,1,0,0,[],0,0,0
310,@kelz017 i have no pictures of me were i am no...,0,12,54,4.5,5,0,1,0,1,[],0,0,0
1036,feeling the effects of muscle in a month weig...,0,20,73,3.65,9,0,0,1,0,[],0,0,0
222,ummmmm...i have done like nothing all day...wh...,0,8,47,5.875,4,0,0,0,0,[],0,0,0
140,"@_tranquilize ill keep you up, i have to stay ...",0,14,55,3.928571,8,0,1,0,0,[],0,0,0
2147,@trapes lol - how long do you walk for? do you...,1,18,67,3.722222,7,0,1,0,0,[],0,0,0
1546,the day i leave my car across the street all d...,0,30,107,3.566667,14,0,0,0,2,[],0,0,0


### Remove RT

In [None]:
df[df['twitts'].str.contains('rt')]

In [80]:
x = 'rt @username: hello hi'

In [81]:
re.sub(r'\brt\b', '', x).strip()

'@username: hello hi'

In [82]:
df['twitts'] = df['twitts'].apply(lambda x: re.sub(r'\brt\b', '', x).strip())

### Special Character or Punctuation Removal

In [83]:
df.sample(3)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_count,emails,email_counts,emails_count,url_flags
1954,@rubyam i do not know what it is maybe you ca...,0,12,47,3.916667,6,0,1,0,1,[],0,0,0
3759,"@mayoryoung hey man, i had fun being on your s...",1,24,100,4.166667,12,0,1,0,1,[],0,0,0
3546,"grilled chicken, broccolli, and water. yummy ...",1,18,91,5.055556,8,0,0,0,1,[],0,0,0


In [85]:
x = '@mayoryoung hey man, i had fun being on your s...'

In [86]:
re.sub(r'[^\w ]+', '',x)

'mayoryoung hey man i had fun being on your s'

In [87]:
df['twitts'] = df['twitts'].apply(lambda x : re.sub(r'[^\w ]+', '',x))

In [49]:
"""
def cont_to_exp(x):
    if type(x) is str:
        for key in custom_contractions:
            value = custom_contractions[key]
            x = x.replace(key, value)
        return x
    else:
        return x
"""

In [47]:
"""
def preprocess_with_custom_dict(text, custom_dict):
    for key, value in custom_dict.items():
        text = text.replace(key, f"{value}_TOKEN")
    return text

def postprocess_with_custom_dict(text, custom_dict):
    for key, value in custom_dict.items():
        text = text.replace(f"{value}_TOKEN", value)
    return text

x = "y'all know I wouldn't forget ur birthday y'know"

# Preprocess: Replace custom contractions with special tokens
x_preprocessed = preprocess_with_custom_dict(x, custom_contractions)

# Use contractions library
x_expanded = contractions.fix(x_preprocessed)

# Postprocess: Replace special tokens with expanded forms
result = postprocess_with_custom_dict(x_expanded, custom_contractions)

print(result)
"""

'\ndef preprocess_with_custom_dict(text, custom_dict):\n    for key, value in custom_dict.items():\n        text = text.replace(key, f"{value}_TOKEN")\n    return text\n\ndef postprocess_with_custom_dict(text, custom_dict):\n    for key, value in custom_dict.items():\n        text = text.replace(f"{value}_TOKEN", value)\n    return text\n\nx = "y\'all know I wouldn\'t forget ur birthday y\'know"\n\n# Preprocess: Replace custom contractions with special tokens\nx_preprocessed = preprocess_with_custom_dict(x, custom_contractions)\n\n# Use contractions library\nx_expanded = contractions.fix(x_preprocessed)\n\n# Postprocess: Replace special tokens with expanded forms\nresult = postprocess_with_custom_dict(x_expanded, custom_contractions)\n\nprint(result)\n'