In [1]:
import pandas as pd
import numpy as np
import os
os.chdir("F://twitter sentiment")

In [2]:
train=pd.read_csv("train.csv")

In [3]:
#no of words
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(" ")))
train[['tweet','word_count']].head()

Unnamed: 0,tweet,word_count
0,@user when a father is dysfunctional and is s...,21
1,@user @user thanks for #lyft credit i can't us...,22
2,bihday your majesty,5
3,#model i love u take with u all the time in ...,17
4,factsguide: society now #motivation,8


In [4]:
train['char_count'] = train['tweet'].str.len() ## this also includes spaces
train[['tweet','char_count']].head()

Unnamed: 0,tweet,char_count
0,@user when a father is dysfunctional and is s...,102
1,@user @user thanks for #lyft credit i can't us...,122
2,bihday your majesty,21
3,#model i love u take with u all the time in ...,86
4,factsguide: society now #motivation,39


In [7]:
#stopwords removal
from nltk.corpus import stopwords
stop = stopwords.words('english')

train['stopwords'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['tweet','stopwords']].head()

Unnamed: 0,tweet,stopwords
0,@user when a father is dysfunctional and is s...,10
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,1
3,#model i love u take with u all the time in ...,5
4,factsguide: society now #motivation,1


In [8]:
#counting hashtags
train['hastags'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['tweet','hastags']].head()

Unnamed: 0,tweet,hastags
0,@user when a father is dysfunctional and is s...,1
1,@user @user thanks for #lyft credit i can't us...,3
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,1
4,factsguide: society now #motivation,1


In [9]:
#calculating no of numerics
train['numerics'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['tweet','numerics']].head()

Unnamed: 0,tweet,numerics
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [10]:
#count of uppercase
train['upper'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['tweet','upper']].head()

Unnamed: 0,tweet,upper
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [11]:
#transforming tweets to lowercase
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['tweet'].head()

0    @user when a father is dysfunctional and is so...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
3    #model i love u take with u all the time in ur...
4                  factsguide: society now #motivation
Name: tweet, dtype: object

In [12]:
#The next step is to remove punctuation, as it doesn’t add any extra information while treating text data. 
#Therefore removing all instances of it will help us reduce the size of the training data.

train['tweet'] = train['tweet'].str.replace('[^\w\s]','')
train['tweet'].head()

0    user when a father is dysfunctional and is so ...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model i love u take with u all the time in urð...
4                    factsguide society now motivation
Name: tweet, dtype: object

In [13]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['tweet'].head()

0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit cant use cause do...
2                                       bihday majesty
3                model love u take u time urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [14]:
#Similarly, just as we removed the most common words, this time let’s remove rarely occurring words from the text. Because they’re so rare, the association between them and other words is dominated by noise. 
#You can replace rare words with a more general form and then this will have higher counts
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
freq

lularoe           1
ltbreakfast       1
urselflove        1
enable            1
kitchenâ          1
noelcastanza      1
hassle            1
onlymoneytalks    1
kidsrehab         1
wipe              1
dtype: int64

In [15]:
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit cant use cause do...
2                                       bihday majesty
3                model love u take u time urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [17]:
#spelling correction is a useful pre-processing step because this also will help us in reducing multiple copies of words
from textblob import TextBlob
train['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    user father dysfunctional selfish drags kiss d...
1    user user thanks left credit can use cause don...
2                                       midday majesty
3                 model love u take u time or ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [18]:
#Tokenization refers to dividing the text into a sequence of words or sentences. In our example, we have used the textblob library to first transform our tweets into a blob and then converted them into a series of words.
TextBlob(train['tweet'][1]).words

WordList(['user', 'user', 'thanks', 'lyft', 'credit', 'cant', 'use', 'cause', 'dont', 'offer', 'wheelchair', 'vans', 'pdx', 'disapointed', 'getthanked'])

In [19]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['tweet'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0    user father dysfunct selfish drag kid dysfunct...
1    user user thank lyft credit cant use caus dont...
2                                       bihday majesti
3                model love u take u time urð ðððð ððð
4                              factsguid societi motiv
Name: tweet, dtype: object

In [20]:
from textblob import Word
train['tweet'] = train['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['tweet'].head()

0    user father dysfunctional selfish drag kid dys...
1    user user thanks lyft credit cant use cause do...
2                                       bihday majesty
3                model love u take u time urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [21]:
#ngrams
TextBlob(train['tweet'][0]).ngrams(2)

[WordList(['user', 'father']),
 WordList(['father', 'dysfunctional']),
 WordList(['dysfunctional', 'selfish']),
 WordList(['selfish', 'drag']),
 WordList(['drag', 'kid']),
 WordList(['kid', 'dysfunction']),
 WordList(['dysfunction', 'run'])]

In [22]:
#Term frequency is simply the ratio of the count of a word present in a sentence, to the length of the sentence.

tf1 = (train['tweet'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,user,2
1,credit,1
2,offer,1
3,wheelchair,1
4,dont,1
5,use,1
6,cause,1
7,getthanked,1
8,cant,1
9,thanks,1


In [23]:
#The intuition behind inverse document frequency (IDF) is that a word is not of much use to us if it’s appearing in all the documents.

#Therefore, the IDF of each word is the log of the ratio of the total number of rows to the number of rows in which that word is present.

#IDF = log(N/n), 

for i,word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['tweet'].str.contains(word)])))

tf1

Unnamed: 0,words,tf,idf
0,user,2,1.054454
1,credit,1,7.327781
2,offer,1,6.522155
3,wheelchair,1,9.273691
4,dont,1,3.745585
5,use,1,1.00667
6,cause,1,5.690172
7,getthanked,1,9.679156
8,cant,1,3.538194
9,thanks,1,4.597751


In [24]:
#TF-IDF is the multiplication of the TF and IDF which we calculated above.

tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,user,2,1.054454,2.108909
1,credit,1,7.327781,7.327781
2,offer,1,6.522155,6.522155
3,wheelchair,1,9.273691,9.273691
4,dont,1,3.745585,3.745585
5,use,1,1.00667,1.00667
6,cause,1,5.690172,5.690172
7,getthanked,1,9.679156,9.679156
8,cant,1,3.538194,3.538194
9,thanks,1,4.597751,4.597751


In [25]:
#our problem was to detect the sentiment of the tweet. So, before applying any ML/DL models (which can have a separate feature detecting the sentiment using the textblob library), let’s check the sentiment of the first few tweets.
train['tweet'][:5].apply(lambda x: TextBlob(x).sentiment)
#Above, you can see that it returns a tuple representing polarity and subjectivity of each tweet. Here, we only extract polarity as it indicates the sentiment as value nearer to 1 means a positive sentiment and values nearer to -1 means a negative sentiment

0    (-0.3, 0.5354166666666667)
1                    (0.2, 0.2)
2                    (0.0, 0.0)
3                    (0.5, 0.6)
4                    (0.0, 0.0)
Name: tweet, dtype: object