## Text cleaning and Preprocessing

In [2]:
pip install spacy

Collecting spacyNote: you may need to restart the kernel to use updated packages.

  Downloading spacy-3.7.2-cp310-cp310-win_amd64.whl (12.1 MB)
     ---------------------------------------- 0.0/12.1 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.1 MB ? eta -:--:--
     --------------------------------------- 0.0/12.1 MB 660.6 kB/s eta 0:00:19
     --------------------------------------- 0.0/12.1 MB 660.6 kB/s eta 0:00:19
     --------------------------------------- 0.0/12.1 MB 219.4 kB/s eta 0:00:55
     --------------------------------------- 0.1/12.1 MB 403.5 kB/s eta 0:00:30
     --------------------------------------- 0.1/12.1 MB 469.7 kB/s eta 0:00:26
     --------------------------------------- 0.1/12.1 MB 448.2 kB/s eta 0:00:27
      -------------------------------------- 0.2/12.1 MB 477.7 kB/s eta 0:00:25
      -------------------------------------- 0.2/12.1 MB 551.4 kB/s eta 0:00:22
      -------------------------------------- 0.2/12.1 MB 554.9 kB/s eta


[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: C:\Users\user\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
import spacy

In [4]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitter4000.csv',encoding= 'latin-1')

In [6]:
df

Unnamed: 0,twitts,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0
...,...,...
3995,i just graduated,1
3996,Templating works; it all has to be done,1
3997,mommy just brought me starbucks,1
3998,@omarepps watching you on a House re-run...lov...,1


In [7]:
df['sentiment'].value_counts()

sentiment
0    2000
1    2000
Name: count, dtype: int64

### Word Counts

In [8]:
df['word_counts'] = df['twitts'].apply(lambda x: len(str(x).split()))

In [9]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts
3685,@lilithsaintcrow You are so cool. I wanna get ...,1,12
2978,@RealitynTails You are welcome for the #ff sho...,1,9
1143,@xxAleksxx aww well yeah at least ya won't be ...,0,25
448,I'm bout 2go smoke. I'm not an addict. I just ...,0,28
2200,"@dannygokey: hey, how r u today? i hope u're d...",1,19


In [10]:
df['word_counts'].max()

32

In [11]:
df['word_counts'].min()

1

In [12]:
df['word_counts'] == 1

0       False
1       False
2       False
3       False
4       False
        ...  
3995    False
3996    False
3997    False
3998    False
3999    False
Name: word_counts, Length: 4000, dtype: bool

In [13]:
df[df['word_counts']==1]

Unnamed: 0,twitts,sentiment,word_counts
385,homework,0,1
691,@ekrelly,0,1
1124,disappointed,0,1
1286,@officialmgnfox,0,1
1325,headache,0,1
1897,@MCRmuffin,0,1
2542,Graduated!,1,1
2947,reading,1,1
3176,@omeirdeleon,1,1
3470,www.myspace.com/myfinalthought,1,1


### Character Counts

In [14]:
def char_counts(x):
    s = x.split()
    x = ''.join(s)
    return len(x)

In [15]:
char_counts('I am wide awake')

12

In [16]:
df['char_counts'] = df['twitts'].apply(lambda x: char_counts(str(x)))

In [17]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts
1990,I FEEL SO..... REPLACED.,0,4,21
1012,@lisalent I am thinking of putting together a...,0,22,113
695,I don't wanna have exams,0,5,20
3786,@JPatrickDowning try Price Cutters.,1,4,32
1257,@theblish Socials,0,2,16


### Average Word Length

In [18]:
df['avg_word_len'] = df['char_counts']/df['word_counts']

In [19]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len
1991,@cherrywopie korean dramas? arent you in japan...,0,22,100,4.545455
3120,"@EmmoLei Oh that's great, enjoy",1,5,27,5.4
3694,Chilln with sj folks &amp;Jessica &amp;Christi...,1,19,100,5.263158
3218,RunsHouse is back.. .thank sweet jesus lol !,1,8,37,4.625
1859,I think Im gonna lay out in the sun a bit then...,0,17,61,3.588235


### Stop Words Count

In [20]:
print(stopwords)

{'herein', 'ever', 'had', 'she', 'enough', 'when', 'therein', 'whether', '‘d', 'but', 'used', 'hereby', 'none', 'of', 'nor', 'yourselves', '‘re', 'you', 'eleven', 'else', 'two', 'him', 'this', 'anyway', 'four', 'afterwards', 'here', 'the', 'various', 'around', 'becomes', 'from', 'unless', 'further', 'otherwise', 'third', 'therefore', 'me', 'am', 'same', 're', 'which', 'own', 'fifteen', 'where', 'her', 'others', 'per', 'without', 'latter', 'so', 'serious', 'seems', 'whose', 'namely', 'might', 'really', 'whither', 'yours', 'hundred', 'keep', 'hereupon', 'too', 'more', 'also', 'ten', 'still', 'should', 'to', 'being', 'via', 'onto', 'off', 'their', '‘m', "n't", 'how', 'whereupon', 'noone', 'thus', 'no', 'across', 'many', 'less', 'even', 'again', 'except', 'least', 'give', 'why', 'never', 'nine', 'each', 'always', 'your', 'see', 'while', 'until', "'m", 'nobody', 'cannot', 'over', 'ours', 'once', 'seeming', 'since', 'beside', 'put', 'eight', 'anything', 'very', 'beforehand', 'other', 'first'

In [21]:
len(stopwords)

326

In [22]:
df['stop_words_len'] = df['twitts'].apply(lambda x: len([t for t in x.split() if t in stopwords]))

### #Hashtags and @Mentions Count

In [23]:
# [t for t in x.split() if t.starstwith('#')]

In [24]:
df['hashtags_count'] = df['twitts'].apply(lambda x: len([t for t in x.split() if t.startswith('#')]))

In [25]:
df['mentions_count'] = df['twitts'].apply(lambda x: len([t for t in x.split() if t.startswith('@')]))

In [26]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count
3265,YAY!!! internet is back on my laptop,1,7,30,4.285714,4,0,0
665,i have crashd my iphone i put this on itunes ...,0,18,78,4.333333,10,0,0
2621,@jordanknight Good knight hun! hope Jon gave y...,1,13,64,4.923077,2,0,1
709,@IzaArtillero i'm not planning to study in Sta...,0,21,106,5.047619,4,0,1
3833,@beebo_wallace Good Night Beebo!,1,4,29,7.25,0,0,1


### Numeric Digits Count

In [27]:
x = 'this is 1 and 2'
x.split()

['this', 'is', '1', 'and', '2']

In [28]:
x.split()[4].isdigit()

True

In [29]:
[t for t in x.split() if t.isdigit()]

['1', '2']

In [30]:
df['numeric_count'] = df['twitts'].apply(lambda x : len([t for t in x.split() if t.isdigit()]))

In [31]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count
3022,"@joelfreak Sure, just don't tell 'em who you g...",1,15,64,4.266667,5,0,1,0
1693,Today hasn't gone off on the right foot at all...,0,22,83,3.772727,10,0,0,0
3942,@mileycyrus great miley!!,1,3,23,7.666667,0,0,1,0
1920,i want to call to selena but i dont live in usa,0,12,36,3.0,7,0,0,0
2867,Happy mothers day to all the cute moms out there,1,10,39,3.9,5,0,0,0


### Upper Case Words Count

In [32]:
x = "I GOT THE JOB"
y = "I got the job"

In [33]:
[t for t in x.split() if t.isupper()]

['I', 'GOT', 'THE', 'JOB']

In [34]:
[t for t in y.split() if t.isupper()]

['I']

In [35]:
df['upper_count'] = df['twitts'].apply(lambda x: len([t for t in x.split() if t.isupper()]))

In [36]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_count
817,is it only so hot in PetachTikva? yesterday i ...,0,20,93,4.65,11,0,0,0,0
140,"@_tranquilize ill keep you up, i have to stay ...",0,14,55,3.928571,8,0,1,0,0
1041,@iyacastillo hey ya iya! miss you!,0,6,29,4.833333,0,0,1,0,0
3282,For the folks that missed me http://twitpic.c...,1,7,47,6.714286,3,0,0,0,0
329,time geos by soo fast...another week of schooll,0,8,40,5.0,2,0,0,0,0


In [37]:
df.iloc[483]['twitts']

"Hear it's raining in SF  Oh well, we will probably go the the Eagle for beer bust anyway."

### Lower Case Conversion

In [38]:
df['twitts'] = df['twitts'].apply(lambda x: str(x).lower())

In [39]:
df.sample(5)

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_count
3971,@rainbowsoulpoet thank you,1,3,24,8.0,1,0,1,0,0
3962,@davidarchie our local shows love tributes too...,1,19,97,5.105263,3,0,1,0,1
508,says woooh! its raining.. http://plurk.com/p/...,0,5,46,9.2,1,0,0,0,0
2550,dans le carrefour http://mobypicture.com/?rqudgw,1,4,45,11.25,0,0,0,0,0
2628,"@toitokyo ok, then what happens?",1,5,28,5.6,2,0,1,0,0


## Contraction to Expansion

In [40]:
# x = "don't shouldn't, i'll "  # do not should not i will

In [41]:
contractions = {
"a’ight": "alright",
"ain’t" : "i am not",
"amn’t": "am not",
"n": "and",
"arencha" : "are not you",
"aren’t": "are not",
"’bout": "about",
"boy's": "boy has",
"can’t": "cannot",
"cap’n": "captain",
"’cause" : "because",
"cuz": "because",
"’cept": "except",
"could’ve": "could have",
"couldn’t": "could not",
"couldn’t’ve" :	"could not have",
"cuppa" : "cup of",
"daren’t" : "dare not",
"daresn’t" : "dare not",
"dasn’t" : "dare not",
"didn’t": "did not",
"doesn't": "does not",
"don’t": "do not",
"dunno"	: "do not know",
"d’ye" : "do you",
"d’ya": "did you",
"e’en":	"even",
"e’er": "ever",
"’em": "them",
"everybody’s": "everybody is",
"everyone’s": "everyone is",
"everything's": "everything is",
"finna": "fixing to",
"fo’c’sle": "forecastle",
"’gainst" : "against",
"g’day":"good day",
"gimme": "give me",
"girl's": "girl is",
"giv’n": "given",
"gi’z": "give us",
"gonna":"going to",
"gon’t": "go not",
"gotta": "got to",
"guy's": "guy is",
"hadn’t" :	"had not",
"had’ve": "had have",
"hasn’t": "has not",
"haven’t":	"have not",
"he’d": "he had",
"he’d": "he would",
"he'll": "he will",
"helluva": "hell of a",
"yes'nt":" yes not",
"he’s ": "he is",
"here’s": "here is",
"how’d": "how did",
"howdy": "how do you do",
"how’ll": "how shall",
"how’re": "how are",
"how’s": "how is",
"i’d":	"I would",
"i’d’ve": "I would have",
"i’d’nt": "I would not",
"i’d’nt’ve": "I would not have",
"if’n": "If and when",
"i’ll": "I will",
"i’m": "I am",
"imma": "I am going to",
"i’mo": "I am going to",
"innit": "isn’t it",
"ion": "I do not",
"i’ve":	"I have",
"isn’t":" is not",
"it’d": "it would",
"it’ll": "it will",
"it’s": "it is",
"idunno": "I don’t know",
"kinda": "kind of",
"let’s": "let us",
"loven’t": "love not",
"ma’am": "madam",
"mayn’t": "may not",
"may’ve": "may have",
"methinks": "I think",
"mightn’t": "might not",
"might’ve": "might have",
"mine’s": "mine is",
"mustn’t": "must not",
"mustn’t’ve": "must not have",
"must’ve": "must have",
"’neath": "beneath",
"needn’t": "need not",
"nal": "and all",
"ne’er": "never",
"o’er":	"over",
"ol’": "old",
"ought’ve": "ought have",
"oughtn’t": "ought not",
"oughtn’t’ve": "ought not have",
"’round": "around",
"’s": "is",
"shan’t": "shall not",
"she’d": "she would",
"she’ll": "she will",
"she’s": "she is",
"should’ve": "should have",
"shouldn’t": "should not",
"shouldn’t’ve": "should not have",
"somebody’s" : "somebody has",
"someone’s": "someone is",
"something’s": "something is",
"so’re": "so are",
"so’s": "so is",
"so’ve": "so have",
"that’ll": "that will",
"that’re": "that are",
"that’s": "that is",
"that’d": "that would",
"there’d": "there would",
"there’ll": "there will",
"there’re": "there are",
"there’s": "there is",
"these’re": "these are",
"these’ve": "these have",
"they’d": "they would",
"they’d've": "they would have",
"they’ll": "they will",
"they’re": "they are",
"they’ve": "they have",
"this’s": "this is",
"those’re": "those are",
"those’ve": "those have",
"’thout": "without",
"’til": "until",
"’tis": "it is",
"to’ve": "to have",
"tryna": "trying to",
"’twas": "it was",
"’tween": "between",
"’twere": "it were",
"w’all": "we all",
"w’at": "we at",
"u": "you",
"ur": "your",
"wanna": "want to",
"wasn’t": "was not",
"we’d": "we would",
"we’d’ve": "we would have",
"we’ll": "we will",
"we’re": "we are",
"we’ve": "we have",
"weren’t": "were not",
"whatcha": 	"what are you",
"what’d": "what did",
"what’ll": "what will",
"what’re": 	"what are",
"what’s": "what is",
"what’ve": "what have",
"when’s": "when is",
"where’d": "where did",
"where’ll": "where will",
"where’re": "where are",
"where’s": 	"where is",
"where’ve": "where have",
"which’d": "which would",
"which’ll": "which will",
"which’re": "which are",
"which’s": 	"which is",
"which’ve": "which have",
"who’d": "who would",
"who’d’ve": "who would have",
"who’ll": "who will",
"who’re": "who are",
"who’s": "who is",
"who’ve": "who have",
"why’d": "why did",
"why’re": "why are",
"why’s": "why is",
"willn’t": 	"will not",
"won’t": "will not",
"wonnot": "will not",
"would’ve": "would have",
"wouldn’t": "would not",
"wouldn’t’ve": "would not have",
" y’ain’t ":	"you are not",
"y’all": "you all",
"y’all’d’ve": "you all would have",
"y’all’dn't’ve":"you all would not have",
"y’all’re": "you all are",
"y’all’ren’t": 	"you all are not",
"y’at": "you at",
"yes’m": "yes madam",
"yever": "have you ever?",
"y’know": "you know",
"yessir": "yes sir",
"you’d": "you would",
"you’ll": "you will",
"you’re": "you are",
"you’ve": "you have",
"when’d": "when did"
}

In [42]:
x = " i'm don't he'll "

In [43]:
"""
def cont_to_exp(x):
    if type(x) is str:
        for key in contractions:
            value = contractions[key]
            x = x.replace(key, value)
        return x.strip()
    else:
        return x
        """

'\ndef cont_to_exp(x):\n    if type(x) is str:\n        for key in contractions:\n            value = contractions[key]\n            x = x.replace(key, value)\n        return x.strip()\n    else:\n        return x\n        '

In [44]:
"""
def cont_to_exp(x):
    if type(x) is str:
        x_lower = x.lower()
        for key in contractions:
            key_lower = key.lower()
            value = contractions[key]
            x_lower = x_lower.replace(key_lower, value)
        return x_lower
    else:
        return x
    
"""

'\ndef cont_to_exp(x):\n    if type(x) is str:\n        x_lower = x.lower()\n        for key in contractions:\n            key_lower = key.lower()\n            value = contractions[key]\n            x_lower = x_lower.replace(key_lower, value)\n        return x_lower\n    else:\n        return x\n    \n'

In [45]:
def cont_to_exp(x):
    if type(x) is str:
        x_lower = x.lower()
        for key in contractions:
            key_lower = key.lower()
            value = contractions[key]
            x_lower = x_lower.replace(key_lower, value)
        return x_lower
    else:
        return x

In [46]:
cont_to_exp(x)

" i'm doand't he will "

In [47]:
df

Unnamed: 0,twitts,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtags_count,mentions_count,numeric_count,upper_count
0,is bored and wants to watch a movie any sugge...,0,10,43,4.300000,5,0,0,0,0
1,back in miami. waiting to unboard ship,0,7,32,4.571429,3,0,0,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,12,54,4.500000,0,0,1,0,1
3,ughhh i am so tired blahhhhhhhhh,0,6,27,4.500000,3,0,0,0,0
4,@mandagoforth me bad! it's funny though. zacha...,0,26,116,4.461538,13,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...
3995,i just graduated,1,3,14,4.666667,2,0,0,0,0
3996,templating works; it all has to be done,1,8,32,4.000000,6,0,0,0,0
3997,mommy just brought me starbucks,1,5,27,5.400000,2,0,0,0,0
3998,@omarepps watching you on a house re-run...lov...,1,8,45,5.625000,3,0,1,0,0
