In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the labelled training tweets and the unlabelled test tweets.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_tweets.csv', 'test_tweets.csv')
)

In [None]:
# Inspect the column names of the training frame.
train.columns

Index(['id', 'label', 'tweet'], dtype='object')

In [None]:
#Number of words
# split() with no argument splits on any run of whitespace, so leading,
# trailing or repeated spaces do not yield empty "words" that inflate the
# count (split(' ') counted them). str() guards against non-string cells.
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split()))
train[['tweet','word_count']]

Unnamed: 0,tweet,word_count
0,@user when a father is dysfunctional and is s...,21
1,@user @user thanks for #lyft credit i can't us...,22
2,bihday your majesty,5
3,#model i love u take with u all the time in ...,17
4,factsguide: society now #motivation,8
...,...,...
31957,ate @user isz that youuu?ðððððð...,6
31958,to see nina turner on the airwaves trying to...,25
31959,listening to sad songs on a monday morning otw...,15
31960,"@user #sikh #temple vandalised in in #calgary,...",13


In [None]:
#Number of characters
# Series.str.len() returns the length of each tweet string, so whitespace
# and punctuation characters are included in the count.
train['char_count'] = train['tweet'].str.len()
train[['tweet','char_count']]

Unnamed: 0,tweet,char_count
0,@user when a father is dysfunctional and is s...,102
1,@user @user thanks for #lyft credit i can't us...,122
2,bihday your majesty,21
3,#model i love u take with u all the time in ...,86
4,factsguide: society now #motivation,39
...,...,...
31957,ate @user isz that youuu?ðððððð...,68
31958,to see nina turner on the airwaves trying to...,131
31959,listening to sad songs on a monday morning otw...,63
31960,"@user #sikh #temple vandalised in in #calgary,...",67


In [None]:
#Average word length

def avg_wordlen(sentence):
  """Return the mean length of the whitespace-separated words in `sentence`.

  Args:
    sentence: text to measure; it is split on runs of whitespace.

  Returns:
    Total number of characters in the words divided by the number of words,
    or 0.0 when `sentence` contains no words (empty or whitespace-only),
    instead of raising ZeroDivisionError.
  """
  words = sentence.split()
  if not words:
    # No words to average over; avoid dividing by zero.
    return 0.0
  return sum(len(word) for word in words) / len(words)

# Apply the helper directly to every tweet; no lambda wrapper is needed.
train['avg_word'] = train['tweet'].apply(avg_wordlen)
train[['tweet', 'avg_word']]


Unnamed: 0,tweet,avg_word
0,@user when a father is dysfunctional and is s...,4.555556
1,@user @user thanks for #lyft credit i can't us...,5.315789
2,bihday your majesty,5.666667
3,#model i love u take with u all the time in ...,4.928571
4,factsguide: society now #motivation,8.000000
...,...,...
31957,ate @user isz that youuu?ðððððð...,12.600000
31958,to see nina turner on the airwaves trying to...,4.652174
31959,listening to sad songs on a monday morning otw...,3.769231
31960,"@user #sikh #temple vandalised in in #calgary,...",5.500000


In [None]:
import nltk

# Fetch the NLTK 'stopwords' package that the stop-word cells rely on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#Number of stopwords
from nltk.corpus import stopwords
# Keep the stop words in a set: membership is tested once per token of every
# tweet, and a set lookup is O(1) whereas a list lookup scans the whole list.
stop = set(stopwords.words('english'))
# Count the tokens of each tweet that are stop words.
train['stopwords'] = train['tweet'].apply(lambda tweet: sum(word in stop for word in tweet.split()))
train[['tweet','stopwords']]

Unnamed: 0,tweet,stopwords
0,@user when a father is dysfunctional and is s...,10
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,1
3,#model i love u take with u all the time in ...,5
4,factsguide: society now #motivation,1
...,...,...
31957,ate @user isz that youuu?ðððððð...,1
31958,to see nina turner on the airwaves trying to...,9
31959,listening to sad songs on a monday morning otw...,5
31960,"@user #sikh #temple vandalised in in #calgary,...",2


In [None]:
#Hashtags and mentions in a tweet

# A token counts when it begins with '#' (hashtag) or '@' (mention).
train['hashtag'] = train['tweet'].apply(
    lambda tweet: sum(1 for token in tweet.split() if token.startswith(('#', '@')))
)
train[['tweet', 'hashtag']]

Unnamed: 0,tweet,hashtag
0,@user when a father is dysfunctional and is s...,2
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,1
4,factsguide: society now #motivation,1
...,...,...
31957,ate @user isz that youuu?ðððððð...,1
31958,to see nina turner on the airwaves trying to...,2
31959,listening to sad songs on a monday morning otw...,0
31960,"@user #sikh #temple vandalised in in #calgary,...",5


In [None]:
#Number of numerics
# Count the whitespace-separated tokens made up entirely of digits.
train['numeric_count'] = train['tweet'].apply(
    lambda tweet: sum(1 for token in tweet.split() if token.isdigit())
)
train[['tweet', 'numeric_count']]


Unnamed: 0,tweet,numeric_count
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0
...,...,...
31957,ate @user isz that youuu?ðððððð...,0
31958,to see nina turner on the airwaves trying to...,0
31959,listening to sad songs on a monday morning otw...,0
31960,"@user #sikh #temple vandalised in in #calgary,...",0


In [None]:
#Number of uppercase words
# Count the tokens for which str.isupper() is true (all cased characters upper-case).
train['upper'] = train['tweet'].apply(
    lambda tweet: sum(1 for token in tweet.split() if token.isupper())
)
train[['tweet', 'upper']]


Unnamed: 0,tweet,upper
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0
...,...,...
31957,ate @user isz that youuu?ðððððð...,0
31958,to see nina turner on the airwaves trying to...,0
31959,listening to sad songs on a monday morning otw...,0
31960,"@user #sikh #temple vandalised in in #calgary,...",0


BASIC PREPROCESSING

In [None]:
#Lower casing

# Lower-case every tweet and re-join its tokens with single spaces, which
# also collapses repeated whitespace and trims the ends.
train['tweet'] = train['tweet'].apply(lambda tweet: " ".join(tweet.lower().split()))
train['tweet']

0        @user when a father is dysfunctional and is so...
1        @user @user thanks for #lyft credit i can't us...
2                                      bihday your majesty
3        #model i love u take with u all the time in ur...
4                      factsguide: society now #motivation
                               ...                        
31957    ate @user isz that youuu?ðððððð...
31958    to see nina turner on the airwaves trying to w...
31959    listening to sad songs on a monday morning otw...
31960    @user #sikh #temple vandalised in in #calgary,...
31961                       thank you @user for you follow
Name: tweet, Length: 31962, dtype: object

In [None]:
#Remove punctuation
# Drop every character that is neither a word character nor whitespace.
# regex=True is required: from pandas 2.0 the default is regex=False, which
# would treat the pattern as a literal string and remove nothing.
# The raw string keeps '\w' and '\s' from being read as string escapes.
train['tweet'] = train['tweet'].str.replace(r'[^\w\s]', '', regex=True)
train['tweet']

0        user when a father is dysfunctional and is so ...
1        user user thanks for lyft credit i cant use ca...
2                                      bihday your majesty
3        model i love u take with u all the time in urð...
4                        factsguide society now motivation
                               ...                        
31957                   ate user isz that youuuðððððððððâï
31958    to see nina turner on the airwaves trying to w...
31959    listening to sad songs on a monday morning otw...
31960    user sikh temple vandalised in in calgary wso ...
31961                        thank you user for you follow
Name: tweet, Length: 31962, dtype: object

In [None]:
#Removing stopwords
# Rebuild each tweet from the tokens that are not in the `stop` collection.
train['tweet'] = train['tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if word not in stop)
)
train['tweet']

0        user father dysfunctional selfish drags kids d...
1        user user thanks lyft credit cant use cause do...
2                                           bihday majesty
3                    model love u take u time urð ðððð ððð
4                            factsguide society motivation
                               ...                        
31957                        ate user isz youuuðððððððððâï
31958    see nina turner airwaves trying wrap mantle ge...
31959      listening sad songs monday morning otw work sad
31960    user sikh temple vandalised calgary wso condem...
31961                                    thank user follow
Name: tweet, Length: 31962, dtype: object

In [None]:
#Common word remover
# Pool all tweets into one token stream and keep the ten most frequent tokens.
freq = (
    pd.Series(' '.join(train['tweet']).split())
    .value_counts()
    .head(10)
)
freq

user     17473
love      2647
ð         2511
day       2199
â         1797
happy     1663
amp       1582
im        1139
u         1136
time      1110
dtype: int64

In [None]:
# Strip the ten most frequent tokens (the index of `freq`) from every tweet.
freq_words = freq.index.tolist()
train['tweet'] = train['tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if word not in freq_words)
)
train['tweet']

0        father dysfunctional selfish drags kids dysfun...
1        thanks lyft credit cant use cause dont offer w...
2                                           bihday majesty
3                                  model take urð ðððð ððð
4                            factsguide society motivation
                               ...                        
31957                             ate isz youuuðððððððððâï
31958    see nina turner airwaves trying wrap mantle ge...
31959      listening sad songs monday morning otw work sad
31960      sikh temple vandalised calgary wso condemns act
31961                                         thank follow
Name: tweet, Length: 31962, dtype: object

In [None]:

# Rare word remover: take the last ten entries of the frequency-sorted token
# counts (the least frequent tokens) and strip them from every tweet.
rare = (
    pd.Series(' '.join(train['tweet']).split())
    .value_counts()
    .tail(10)
)
rare_words = rare.index.tolist()
train['tweet'] = train['tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if word not in rare_words)
)
train['tweet']

0        father dysfunctional selfish drags kids dysfun...
1        thanks lyft credit cant use cause dont offer w...
2                                           bihday majesty
3                                  model take urð ðððð ððð
4                            factsguide society motivation
                               ...                        
31957                             ate isz youuuðððððððððâï
31958    see nina turner airwaves trying wrap mantle ge...
31959      listening sad songs monday morning otw work sad
31960      sikh temple vandalised calgary wso condemns act
31961                                         thank follow
Name: tweet, Length: 31962, dtype: object

In [None]:
#Installing TEXTBLOB
!pip install -U textblob

Requirement already up-to-date: textblob in /usr/local/lib/python3.6/dist-packages (0.15.3)


In [None]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [None]:
#Spelling correction
from textblob import TextBlob

# Preview TextBlob's spelling correction on the first five tweets only;
# the result is displayed, not written back to `train`.
train['tweet'].iloc[:5].apply(lambda tweet: str(TextBlob(tweet).correct()))


0    father dysfunctional selfish drags kiss dysfun...
1    thanks left credit can use cause dont offer wh...
2                                       midday majesty
3                               model take or ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [None]:
#Tokenization
# Show the word tokens TextBlob extracts from the tweet with index label 1.
TextBlob(train['tweet'].loc[1]).words

WordList(['thanks', 'lyft', 'credit', 'cant', 'use', 'cause', 'dont', 'offer', 'wheelchair', 'vans', 'pdx', 'disapointed', 'getthanked'])

In [None]:
#Stemming

from nltk.stem import PorterStemmer

# Preview Porter stemming on the first five tweets (result is only displayed).
st = PorterStemmer()
train['tweet'].iloc[:5].apply(
    lambda tweet: ' '.join(st.stem(word) for word in tweet.split())
)

0        father dysfunct selfish drag kid dysfunct run
1    thank lyft credit cant use caus dont offer whe...
2                                       bihday majesti
3                              model take urð ðððð ððð
4                              factsguid societi motiv
Name: tweet, dtype: object

In [None]:
#lemmatizing
from textblob import Word

# Preview lemmatization on the first five tweets (result is only displayed).
train['tweet'].iloc[:5].apply(
    lambda tweet: ' '.join(Word(word).lemmatize() for word in tweet.split())
)

0    father dysfunctional selfish drag kid dysfunct...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

ADVANCED TEXT PROCESSING

In [None]:
#n-grams
# Bigrams (n=2) of the tweet with index label 0.
TextBlob(train['tweet'].loc[0]).ngrams(2)

[WordList(['father', 'dysfunctional']),
 WordList(['dysfunctional', 'selfish']),
 WordList(['selfish', 'drags']),
 WordList(['drags', 'kids']),
 WordList(['kids', 'dysfunction']),
 WordList(['dysfunction', 'run'])]

In [None]:
#Term frequency
# Count how often each token occurs in the second tweet (row position 1).
# Series.value_counts() is used because the top-level pd.value_counts() is
# deprecated in recent pandas releases.
tf1 = pd.Series(train['tweet'].iloc[1].split()).value_counts().reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,dont,1
1,thanks,1
2,credit,1
3,wheelchair,1
4,use,1
5,vans,1
6,lyft,1
7,cant,1
8,offer,1
9,getthanked,1


In [None]:

#Inverse Document Frequency
# idf(word) = log(N / number of tweets that contain the word).
# Membership is tested against each tweet's token set rather than with
# str.contains(): str.contains() is a regex *substring* match, so a word like
# 'use' was also counted in tweets containing 'cause' or 'because', which
# overstated its document frequency and understated its idf.
n_docs = train.shape[0]
# Tokenise every tweet once, outside the per-word loop.
tweet_tokens = [set(tweet.split()) for tweet in train['tweet']]
tf1['idf'] = [
    np.log(n_docs / sum(word in tokens for tokens in tweet_tokens))
    for word in tf1['words']
]
tf1

Unnamed: 0,words,tf,idf
0,dont,1,3.745585
1,thanks,1,4.597751
2,credit,1,7.327781
3,wheelchair,1,9.273691
4,use,1,3.542509
5,vans,1,8.426393
6,lyft,1,8.762865
7,cant,1,3.538194
8,offer,1,6.522155
9,getthanked,1,9.679156


In [None]:
# TF-IDF is the element-wise product of term frequency and inverse document frequency.
tf1['tfidf'] = tf1['tf'].mul(tf1['idf'])
tf1

Unnamed: 0,words,tf,idf,tfidf
0,dont,1,3.745585,3.745585
1,thanks,1,4.597751,4.597751
2,credit,1,7.327781,7.327781
3,wheelchair,1,9.273691,9.273691
4,use,1,3.542509,3.542509
5,vans,1,8.426393,8.426393
6,lyft,1,8.762865,8.762865
7,cant,1,3.538194,3.538194
8,offer,1,6.522155,6.522155
9,getthanked,1,9.679156,9.679156


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level unigram TF-IDF features, capped at 1000 features, with
# scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    lowercase=True,
    stop_words='english',
    max_features=1000,
)
train_vect = tfidf.fit_transform(train['tweet'])
train_vect

<31962x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 108902 stored elements in Compressed Sparse Row format>

In [None]:
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Word-level unigram count features, capped at 1000 features.
bow = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    lowercase=True,
    max_features=1000,
)
train_bow = bow.fit_transform(train['tweet'])
train_bow

<31962x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 123028 stored elements in Compressed Sparse Row format>

In [None]:
#Sentiment Analysis of first tweets
# TextBlob(...).sentiment is shown for each of the first five tweets.
train['tweet'].iloc[:5].apply(lambda tweet: TextBlob(tweet).sentiment)

0    (-0.5, 1.0)
1     (0.2, 0.2)
2     (0.0, 0.0)
3     (0.0, 0.0)
4     (0.0, 0.0)
Name: tweet, dtype: object

In [None]:
#Making a feature
# Store the first component of TextBlob's sentiment (the polarity) per tweet.
train['sentiment'] = train['tweet'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
train[['tweet', 'sentiment']]

WORD EMBEDDINGS

In [None]:
#Steps of working with pretrained word embeddings

# Fixed the `impot` typo, which made this cell a SyntaxError.
from gensim.scripts.glove2word2vec import glove2word2vec
# Placeholder paths: replace with the downloaded GloVe file and the desired
# output location for the converted word2vec-format file.
glove_input_file     = 'downloaded file of wordembedding'
word2vec_output_file = 'downloaded file.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
#Loading above word2vec file as model
# The original import line was not valid Python and named a non-existent
# package; KeyedVectors (used below) lives in gensim.models.
from gensim.models import KeyedVectors
# Placeholder path: must match the word2vec file written by glove2word2vec.
filename = 'downloaded file.word2vec'
# binary=False because the converted file is in the text word2vec format.
model = KeyedVectors.load_word2vec_format(filename, binary=False)