In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
# Read the training and test splits from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Display the training frame as loaded from train.csv.
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [4]:
# Clean data (lowercase; remove bracketed text, URLs, HTML tags, punctuation, newlines and tokens containing digits)

def clean_text(text):
    """Lowercase a string and strip noise from it.

    The steps run in this order: lowercase; remove ``[...]`` spans; remove
    URLs (``http://``, ``https://`` or ``www.`` followed by non-space
    characters); remove ``<...>`` tags; remove every character listed in
    ``string.punctuation``; remove newlines; remove any token that contains
    a digit.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace left behind by removed tokens is not
        collapsed, so consecutive spaces can remain.
    """
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d, \*) from being
    # parsed as (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # NOTE(review): '*' is part of string.punctuation and was already removed
    # above, so this substitution never matches and "<CURSE>" is never
    # inserted. Kept as-is so the cleaned output does not change.
    text = re.sub(r"\*+", "<CURSE>", text)
    return text

In [5]:
# Show the rows whose text is missing (NaN).
train_df.loc[train_df['text'].isna()]

Unnamed: 0,textID,text,selected_text,sentiment
314,fdb77c3752,,,neutral


In [6]:
# Remove, in place, every row that has a missing value in any column.
train_df.dropna(how='any', axis='index', inplace=True)

In [7]:
# Run clean_text over both free-text columns, overwriting them with the cleaned strings.
for column in ('text', 'selected_text'):
    train_df[column] = train_df[column].map(clean_text)

In [8]:
# Preview the first rows after text and selected_text have been cleaned.
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,id have responded if i were going,id have responded if i were going,neutral
1,549e992a42,sooo sad i will miss you here in san diego,sooo sad,negative
2,088c60f138,my boss is bullying me,bullying me,negative
3,9642c003ef,what interview leave me alone,leave me alone,negative
4,358bd9e861,sons of why couldnt they put them on the rel...,sons of,negative


In [9]:
# List the rows whose selected_text became an empty string after cleaning.
train_df.loc[train_df['selected_text'].eq('')]

Unnamed: 0,textID,text,selected_text,sentiment
796,233b90417d,i never knew i could miss my phone so much fo...,,negative
891,584ed4d88e,not going to the dance recital and now i feel ...,,negative
962,ae1b6151f2,facebook is being a,,negative
1044,ecf25bb855,i hope theyr all wrong we need him,,negative
1759,89e7895985,apple store is down for updates whats coming...,,negative
...,...,...,...,...
25919,93fa13fa5b,my sisters pc just blued screened me,,negative
26005,0b3fe0ca78,,,neutral
26371,1d1e147bed,oh fack u gave me police thats some seriouse,,negative
27360,64315d7fad,so im at the point now that it has become appa...,,negative


In [10]:
# List the rows whose text became an empty string after cleaning.
train_df.loc[train_df['text'].eq('')]

Unnamed: 0,textID,text,selected_text,sentiment
1319,bc84f21e3b,,shoesshoesshoesyayyayyayloli,positive
6399,d02b3284fd,,,neutral


In [11]:
# Discard rows with an empty text first, then rows with an empty selected_text.
train_df.drop(index=train_df.index[train_df['text'].eq('')], inplace=True)
train_df.drop(index=train_df.index[train_df['selected_text'].eq('')], inplace=True)

In [12]:
# Function that returns the words of selected_text that do not appear in text (treated as misspelled).

def wrong_words(x):
    """Return the words of the selected text that are absent from the text.

    Both strings are split on whitespace and compared word by word.

    Args:
        x: A sequence (tuple, list, or DataFrame row) whose first element is
            the full text and whose second element is the selected text.

    Returns:
        The absent words joined by single spaces, in their original order
        and with repeats kept, or the string "NO ERRORS" when every selected
        word appears in the text.
    """
    # Read the fields by position via list(); indexing a string-labelled
    # pandas row with integers (x[0]) is deprecated.
    fields = list(x)
    text, sel_text = fields[0], fields[1]
    # Split the text once and use a set so each membership test is O(1).
    known_words = set(text.split())
    wrg_words = [word for word in sel_text.split() if word not in known_words]
    if wrg_words:
        return " ".join(wrg_words)
    return "NO ERRORS"

In [13]:
# For each row, record which selected_text words are missing from text.
train_df['misspelled'] = [
    wrong_words((full_text, chosen_text))
    for full_text, chosen_text in zip(train_df['text'], train_df['selected_text'])
]

In [14]:
# Show the rows where at least one selected_text word is missing from text.
train_df.loc[train_df['misspelled'].ne("NO ERRORS")]

Unnamed: 0,textID,text,selected_text,sentiment,misspelled
18,af3fed7fc3,is back home now gonna miss every one,onna,negative,onna
27,bdc32ea43c,on the way to malaysiano internet access to twit,no internet,negative,no
32,1c31703aef,if it is any consolation i got my bmi tested ...,well so much for being unhappy for about minute,negative,minute
39,2863f435bd,a little happy for the wine jeje ok itsm my fr...,a little happy fo,positive,fo
48,3d9d4b0b55,i donbt like to peel prawns i also dont like g...,dont like go,negative,go
...,...,...,...,...,...
27426,132e051fe8,my cousins moved there like years ago and i...,m sad,negative,m
27456,d32efe060f,i wanna leave work already not feelin it,wanna leave work al,negative,al
27470,778184dff1,lol i know and hahadid you fall asleep or ju...,t bored,negative,t
27474,8f14bb2715,so i get up early and i feel good about the da...,i feel good ab,positive,ab


In [15]:
# Preview the first 30 rows, which now include the misspelled column.
train_df.head(30)

Unnamed: 0,textID,text,selected_text,sentiment,misspelled
0,cb774db0d1,id have responded if i were going,id have responded if i were going,neutral,NO ERRORS
1,549e992a42,sooo sad i will miss you here in san diego,sooo sad,negative,NO ERRORS
2,088c60f138,my boss is bullying me,bullying me,negative,NO ERRORS
3,9642c003ef,what interview leave me alone,leave me alone,negative,NO ERRORS
4,358bd9e861,sons of why couldnt they put them on the rel...,sons of,negative,NO ERRORS
5,28b57f3990,some shameless plugging for the best rangers...,some shameless plugging for the best rangers...,neutral,NO ERRORS
6,6e0c6d75b1,feedings for the baby are fun when he is all ...,fun,positive,NO ERRORS
7,50e14c0bb8,soooo high,soooo high,neutral,NO ERRORS
8,e050245fbd,both of you,both of you,neutral,NO ERRORS
9,fc2cbefa9d,journey wow u just became cooler hehe is tha...,wow u just became cooler,positive,NO ERRORS


In [16]:
# Show the rows whose misspelled value is exactly one character long,
# i.e. a single one-letter word of selected_text is missing from text.
train_df.loc[train_df['misspelled'].str.len() == 1]

Unnamed: 0,textID,text,selected_text,sentiment,misspelled
49,3fcea4debc,which case i got a new one last week and im n...,d im not thrilled at all with mine,negative,d
66,95e12b1cb1,hes awesome have you worked with him before ...,s awesome,positive,s
129,94f67cfa6d,hey mia totally adore your music when will ...,y adore,positive,y
134,6903cb08f2,nice to see you tweeting its sunday may and...,e nice,positive,e
166,c78bf59e67,lichfield tweetup sounds like fun hope to s...,p sounds like fun,positive,p
...,...,...,...,...,...
27153,a044ed928d,enjoy nola definitely one of my favorite citi...,y one of my favorite cities in the world,positive,y
27240,40143b692e,who knows it makes me sad lol,e sad,negative,e
27426,132e051fe8,my cousins moved there like years ago and i...,m sad,negative,m
27470,778184dff1,lol i know and hahadid you fall asleep or ju...,t bored,negative,t


In [17]:
# Remove the single-character misspelled word from selected_text

def clean_sel_text(x):
    """Remove the misspelled token from the selected text.

    Args:
        x: A sequence (tuple, list, or DataFrame row) laid out as
            (text, selected_text, misspelled). Only the second and third
            elements are used.

    Returns:
        The selected text, split on whitespace and re-joined with single
        spaces, with the first token equal to ``misspelled`` removed. If no
        token equals ``misspelled`` the re-joined text is returned without
        any removal instead of raising ValueError.
    """
    # Read the fields by position via list(); indexing a string-labelled
    # pandas row with integers (x[1]) is deprecated.
    fields = list(x)
    sel_text, misspelled = fields[1], fields[2]
    tokens = sel_text.split()
    # list.remove raises ValueError when the value is absent, so guard it.
    if misspelled in tokens:
        tokens.remove(misspelled)
    return " ".join(tokens)

In [18]:
# Rows flagged with a one-character misspelled value get that token removed
# by clean_sel_text; all other rows keep their selected_text unchanged.
train_df['selected_text'] = train_df[['text', 'selected_text', 'misspelled']].apply(
    lambda row: row['selected_text'] if len(row['misspelled']) != 1 else clean_sel_text(row),
    axis=1,
)

In [19]:
# Show the rows flagged by the misspelled column (the column itself has not been recomputed yet).
train_df.loc[train_df['misspelled'].ne("NO ERRORS")]

Unnamed: 0,textID,text,selected_text,sentiment,misspelled
18,af3fed7fc3,is back home now gonna miss every one,onna,negative,onna
27,bdc32ea43c,on the way to malaysiano internet access to twit,no internet,negative,no
32,1c31703aef,if it is any consolation i got my bmi tested ...,well so much for being unhappy for about minute,negative,minute
39,2863f435bd,a little happy for the wine jeje ok itsm my fr...,a little happy fo,positive,fo
48,3d9d4b0b55,i donbt like to peel prawns i also dont like g...,dont like go,negative,go
...,...,...,...,...,...
27426,132e051fe8,my cousins moved there like years ago and i...,sad,negative,m
27456,d32efe060f,i wanna leave work already not feelin it,wanna leave work al,negative,al
27470,778184dff1,lol i know and hahadid you fall asleep or ju...,bored,negative,t
27474,8f14bb2715,so i get up early and i feel good about the da...,i feel good ab,positive,ab


In [20]:
# Keep rows whose misspelled value is longer than one character ("NO ERRORS"
# also passes this length test, so it is excluded explicitly below), then
# show the flagged rows with positive sentiment.
m = train_df.loc[train_df['misspelled'].str.len() > 1]

m.loc[m['misspelled'].ne("NO ERRORS") & m['sentiment'].eq("positive")]


Unnamed: 0,textID,text,selected_text,sentiment,misspelled
39,2863f435bd,a little happy for the wine jeje ok itsm my fr...,a little happy fo,positive,fo
149,11c10e3fc5,shes unassuming and unpretentious shes just a...,endearing,positive,endearing
168,e56eeb4968,few bevvies in twngreat on a day off,great,positive,great
240,7df7221145,thats why i need to be thereto represent the...,thats why i need to be there,positive,there
309,a54d3c2825,i know it was worth a shot though,as wort,positive,as wort
...,...,...,...,...,...
27339,27829d441f,just got out of the pool so funnow gonna wat...,so fun,positive,fun
27376,13fbc75291,going to church in the morninghappy mommas day...,happy,positive,happy
27386,e149ebd3a1,would one of the vwllers want to add this eve...,ch appreciat,positive,ch appreciat
27401,261e064dd4,oh silence verona i am wanting to go jaja ...,ja enjoyyitverymu,positive,ja enjoyyitverymu


In [21]:
# From the rows with a misspelled value longer than one character, show the
# flagged ones with negative sentiment.

m.loc[m['misspelled'].ne("NO ERRORS") & m['sentiment'].eq("negative")]


Unnamed: 0,textID,text,selected_text,sentiment,misspelled
18,af3fed7fc3,is back home now gonna miss every one,onna,negative,onna
27,bdc32ea43c,on the way to malaysiano internet access to twit,no internet,negative,no
32,1c31703aef,if it is any consolation i got my bmi tested ...,well so much for being unhappy for about minute,negative,minute
48,3d9d4b0b55,i donbt like to peel prawns i also dont like g...,dont like go,negative,go
60,52483f7da8,i lost all my friends im alone and sleepyi wan...,i lost all my friends im alone and sleepy,negative,sleepy
...,...,...,...,...,...
27209,30a1e8c2b4,guys i know my ability to read time telli...,es faile,negative,es faile
27280,e77145c41c,what happened are you suffering from necksho...,i cant move eithe,negative,eithe
27302,90c8aa60db,have i ever told you i absolutly hate writing ...,hate wr,negative,wr
27362,7b82d63ee4,just found out i wont be tweeting from my phon...,c sorr,negative,c sorr


In [22]:
# From the rows with a misspelled value longer than one character, show the
# flagged ones with neutral sentiment.

m.loc[m['misspelled'].ne("NO ERRORS") & m['sentiment'].eq("neutral")]


Unnamed: 0,textID,text,selected_text,sentiment,misspelled
251,77ba0fee75,powerblog what is this powerblog challenge y...,g what is this powerblog challenge you keep ta...,neutral,g followe
366,b751f39570,yea i should know but tell me everything ps ...,yea i should know but tell me everything ps s...,neutral,ha
581,1fce3e2d7b,no bueno hollykins needs to feel better asap...,no bueno hollykins needs to feel better asap ...,neutral,soproudofyo
637,b04bbb81c9,hi to one kiwi artist from another kiwi artist,hi to one kiwi artist from another kiwi artis,neutral,artis
639,289317384e,oooo ok why havent you accepted my friends...,oooo ok why havent you accepted my friends ...,neutral,reque
...,...,...,...,...,...
26870,ff77427519,will do it in a couple od days when i have ...,will do it in a couple od days when i have ...,neutral,pe
26882,336b9cfc93,xdxdxd you crazy little thing why didnï¿½t ...,dxd you crazy little thing why didnï¿½t you ge...,neutral,dxd
27067,c2dfc5875c,jamie sean cody up for some angry jamie sea...,amie sean cody up for some angry jamie sean...,neutral,amie
27111,44cf670176,haha itll be gross by the time it comes back ...,haha itll be gross by the time it comes back ...,neutral,india


In [23]:
#Fuzzywuzzy library uses Levenshtein Distance for calculating difference between two sequences.

from fuzzywuzzy import fuzz



In [24]:
# Replace each misspelled word with a word from text whose similarity score exceeds cutoff.

def check_fuzzy(x, cutoff):
    """Replace misspelled selected-text words with their closest text word.

    Each misspelled word is scored against every word of the text with
    ``fuzz.ratio``. The highest-scoring text word replaces it, provided that
    score is strictly greater than ``cutoff``. Taking the best match rather
    than the first word above the cutoff avoids substituting an unrelated
    word when the cutoff is low.

    Args:
        x: A sequence (tuple, list, or DataFrame row) laid out as
            (text, selected_text, misspelled), where ``misspelled`` is a
            space-separated list of words.
        cutoff: Score a candidate must exceed to be used as a replacement.

    Returns:
        The selected text, split on whitespace and re-joined with single
        spaces, with every replaceable misspelled word substituted. Words
        with no candidate above ``cutoff`` are left unchanged.
    """
    # Read the fields by position via list(); indexing a string-labelled
    # pandas row with integers (x[0]) is deprecated.
    fields = list(x)
    text_words = fields[0].split()
    sel_words = fields[1].split()
    for wrong in fields[2].split():
        if wrong not in sel_words:
            # Already replaced by an earlier iteration, or not present at all.
            continue
        best_word, best_score = None, cutoff
        for candidate in text_words:
            score = fuzz.ratio(candidate, wrong)
            # Strict '>' keeps the earliest candidate on ties.
            if score > best_score:
                best_word, best_score = candidate, score
        if best_word is not None:
            # Substitute every occurrence of the misspelled token.
            sel_words = [best_word if word == wrong else word for word in sel_words]
    return " ".join(sel_words)

In [25]:
# Show the flagged rows before the fuzzy replacement is applied.
train_df.loc[train_df['misspelled'].ne("NO ERRORS")]

Unnamed: 0,textID,text,selected_text,sentiment,misspelled
18,af3fed7fc3,is back home now gonna miss every one,onna,negative,onna
27,bdc32ea43c,on the way to malaysiano internet access to twit,no internet,negative,no
32,1c31703aef,if it is any consolation i got my bmi tested ...,well so much for being unhappy for about minute,negative,minute
39,2863f435bd,a little happy for the wine jeje ok itsm my fr...,a little happy fo,positive,fo
48,3d9d4b0b55,i donbt like to peel prawns i also dont like g...,dont like go,negative,go
...,...,...,...,...,...
27426,132e051fe8,my cousins moved there like years ago and i...,sad,negative,m
27456,d32efe060f,i wanna leave work already not feelin it,wanna leave work al,negative,al
27470,778184dff1,lol i know and hahadid you fall asleep or ju...,bored,negative,t
27474,8f14bb2715,so i get up early and i feel good about the da...,i feel good ab,positive,ab


In [26]:
# Run check_fuzzy with a cutoff of 50 on every flagged row; rows marked
# "NO ERRORS" keep their selected_text unchanged.
train_df['selected_text'] = train_df[['text', 'selected_text', 'misspelled']].apply(
    lambda row: row['selected_text'] if row['misspelled'] == "NO ERRORS" else check_fuzzy(row, 50),
    axis=1,
)

In [27]:
# Show the previously flagged rows again to inspect the effect of the fuzzy replacement.
train_df.loc[train_df['misspelled'].ne("NO ERRORS")]

Unnamed: 0,textID,text,selected_text,sentiment,misspelled
18,af3fed7fc3,is back home now gonna miss every one,gonna,negative,onna
27,bdc32ea43c,on the way to malaysiano internet access to twit,no internet,negative,no
32,1c31703aef,if it is any consolation i got my bmi tested ...,well so much for being unhappy for about minutes,negative,minute
39,2863f435bd,a little happy for the wine jeje ok itsm my fr...,a little happy for,positive,fo
48,3d9d4b0b55,i donbt like to peel prawns i also dont like g...,dont like going,negative,go
...,...,...,...,...,...
27426,132e051fe8,my cousins moved there like years ago and i...,sad,negative,m
27456,d32efe060f,i wanna leave work already not feelin it,wanna leave work al,negative,al
27470,778184dff1,lol i know and hahadid you fall asleep or ju...,bored,negative,t
27474,8f14bb2715,so i get up early and i feel good about the da...,i feel good about,positive,ab


In [28]:
# Display the frame after the first fuzzy-replacement pass.
train_df

Unnamed: 0,textID,text,selected_text,sentiment,misspelled
0,cb774db0d1,id have responded if i were going,id have responded if i were going,neutral,NO ERRORS
1,549e992a42,sooo sad i will miss you here in san diego,sooo sad,negative,NO ERRORS
2,088c60f138,my boss is bullying me,bullying me,negative,NO ERRORS
3,9642c003ef,what interview leave me alone,leave me alone,negative,NO ERRORS
4,358bd9e861,sons of why couldnt they put them on the rel...,sons of,negative,NO ERRORS
...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on denver husband l...,lost,negative,d
27477,4f4c4fc327,ive wondered about rake to the client has ma...,dont force,negative,NO ERRORS
27478,f67aae2310,yay good for both of you enjoy the break you...,yay good for both of you,positive,NO ERRORS
27479,ed167662a5,but it was worth it,but it was worth it,positive,NO ERRORS


In [29]:
# The replacements above may have left new single-character or otherwise
# unmatched words in selected_text, so discard the now-stale "misspelled"
# column before repeating the cleaning steps.
train_df = train_df.drop('misspelled', axis=1)

In [30]:
# Display the frame after the misspelled column has been dropped.
train_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,id have responded if i were going,id have responded if i were going,neutral
1,549e992a42,sooo sad i will miss you here in san diego,sooo sad,negative
2,088c60f138,my boss is bullying me,bullying me,negative
3,9642c003ef,what interview leave me alone,leave me alone,negative
4,358bd9e861,sons of why couldnt they put them on the rel...,sons of,negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on denver husband l...,lost,negative
27477,4f4c4fc327,ive wondered about rake to the client has ma...,dont force,negative
27478,f67aae2310,yay good for both of you enjoy the break you...,yay good for both of you,positive
27479,ed167662a5,but it was worth it,but it was worth it,positive


In [31]:
# Recompute, per row, which selected_text words are still missing from text.
train_df['misspelled'] = [
    wrong_words((full_text, chosen_text))
    for full_text, chosen_text in zip(train_df['text'], train_df['selected_text'])
]

In [32]:
# Keep only the flagged rows, then show those whose misspelled value is longer than one character.
m = train_df.loc[train_df['misspelled'].ne("NO ERRORS")]
m.loc[m['misspelled'].str.len() > 1]


Unnamed: 0,textID,text,selected_text,sentiment,misspelled
27,bdc32ea43c,on the way to malaysiano internet access to twit,no internet,negative,no
223,242d92151a,walking to class i hate not having a bikeespec...,i hate not having a bike,negative,bike
362,b94aaf845e,please review sunehre ad placement,please re,positive,re
569,03f9f6f798,i dont think ive ever been so tierd in my life...,i dont think ive ever been so tierd in my lifeu,negative,lifeu
604,15f47296e2,soooim kinda o sick n tired of the bs that guy...,im kinda o sick n tired,negative,im
...,...,...,...,...,...
26687,c5de5361d2,does anyone out there want to be really awesom...,does anyone out there want to be really awesom...,neutral,ht
26883,ee30bf8953,im yours jason mrazlooking for an electric gu...,yay,positive,yay
27209,30a1e8c2b4,guys i know my ability to read time telli...,es failed,negative,es
27302,90c8aa60db,have i ever told you i absolutly hate writing ...,hate wr,negative,wr


In [33]:
# Second pass: rows flagged with a one-character misspelled value get that
# token removed by clean_sel_text; all other rows are left unchanged.
train_df['selected_text'] = train_df[['text', 'selected_text', 'misspelled']].apply(
    lambda row: row['selected_text'] if len(row['misspelled']) != 1 else clean_sel_text(row),
    axis=1,
)

In [34]:
# Second pass: run check_fuzzy with a looser cutoff of 30 on every flagged
# row; rows marked "NO ERRORS" keep their selected_text unchanged.
train_df['selected_text'] = train_df[['text', 'selected_text', 'misspelled']].apply(
    lambda row: row['selected_text'] if row['misspelled'] == "NO ERRORS" else check_fuzzy(row, 30),
    axis=1,
)

In [35]:
# Discard the stale misspelled column before recomputing it.
train_df = train_df.drop('misspelled', axis=1)

In [36]:
# Recompute the missing-word flags after the second cleaning pass.
train_df['misspelled'] = [
    wrong_words((full_text, chosen_text))
    for full_text, chosen_text in zip(train_df['text'], train_df['selected_text'])
]

In [37]:
# Show the rows that are still flagged after both cleaning passes.
train_df.loc[train_df['misspelled'].ne("NO ERRORS")]

Unnamed: 0,textID,text,selected_text,sentiment,misspelled
1588,a7f72a928a,woooooooooo are you coming to nottingham at...,to lovelovelove,positive,lovelovelove
5697,e8c90dee68,im kinda bored anyone else i think ill listen ...,im kinda bored anyone else i think ill listen ...,neutral,ww
7410,3463ecdfd6,imintheroom imwatchingthehannahmoviewithmomshe...,great,positive,great
16201,e78c1ad3f5,off to work off at,lammmeeee,negative,lammmeeee
25293,2fdbe40c03,grreverytime he gets a new girlfriendim at the...,im at the bottom of the totem pole,negative,im


In [38]:
# A few records still have errors in selected_text.
# There are only 5 such rows, so we can correct them manually.

In [39]:
# Manually correct the rows that are still flagged.
# Each write goes through a single .loc[row_label, column] call so it lands on
# train_df itself. The chained form `train_df.loc[row].column = value` assigns
# to the intermediate object returned by `.loc[row]`, which pandas does not
# guarantee to be a view of train_df (under Copy-on-Write it never is).
train_df.loc[1588, 'selected_text'] = 'woooooooooo'
train_df.loc[5697, 'selected_text'] = 'im kinda bored'
# For this row the text (not the selection) is rewritten, with the words separated.
train_df.loc[7410, 'text'] = 'i m in the room im watching the hannah movie with mom, she said this film is very great .'
train_df.loc[16201, 'selected_text'] = 'off to work'
train_df.loc[25293, 'selected_text'] = 'at the bottom of the totem pole'

In [40]:
# Discard the stale misspelled column before the final check.
train_df = train_df.drop('misspelled', axis=1)

In [41]:
# Final check: recompute which selected_text words are missing from text.
train_df['misspelled'] = [
    wrong_words((full_text, chosen_text))
    for full_text, chosen_text in zip(train_df['text'], train_df['selected_text'])
]

In [42]:
# List any rows that are still flagged; an empty result means every selected_text word appears in text.
train_df.loc[train_df['misspelled'].ne("NO ERRORS")]

Unnamed: 0,textID,text,selected_text,sentiment,misspelled


In [43]:
# Remove the helper misspelled column so that it is not written to the output file.
train_df = train_df.drop('misspelled', axis=1)

In [44]:
# Save the cleaned frame as CSV; index=False leaves the row index out of the file.
train_df.to_csv('cleaned_data.csv', index=False)