# Textual data analysis

### Data: Tweet samples downloaded from the nltk library

In [None]:
! pip install gensim
! pip install nltk --upgrade

In [1]:
#Set up

import nltk

#RUN ONLY ONCE. When running the nltk.download() command a new window will open, 
#click the 'corpora' tab and download stopwords and twitter_samples 
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
#Import remaining libraries

import pandas as pd
from gensim.models import Word2Vec
import nltk
from nltk.corpus import twitter_samples, stopwords
from nltk.stem.porter import PorterStemmer
from nltk.collocations import *

from nltk import tokenize

#%matplotlib inline

### Load data

There are three json files in twitter_samples. I will work on positive tweets

In [3]:
tweets = twitter_samples.strings('positive_tweets.json')

In [12]:
#Preview of the first four tweets
print(tweets[0:3])

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)', '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!', '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!']


### Tokenize words

In [7]:
#To tokenize the tweets (ie cut the tweets into words) we need to download 'punkt' from the nltk library
nltk.download('punkt')
tokens = [tokenize.word_tokenize(s) for s in tweets]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariabugge/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
#Let's look at how our third tweet: 
print(tokens[2])

['@', 'DespiteOfficial', 'we', 'had', 'a', 'listen', 'last', 'night', ':', ')', 'As', 'You', 'Bleed', 'is', 'an', 'amazing', 'track', '.', 'When', 'are', 'you', 'in', 'Scotland', '?', '!']


We can now analyze words instead of sentences. As a start, let's create a pandas series in which we'll have each word and how many times it appears in the dataset. 

In [13]:
count_words = pd.Series()
for s in tokens :
    count_words = count_words.append(pd.Series(s))

In [14]:
#Printing the top 10 most used words
count_words.value_counts()[:10]

:      6667
)      5165
@      5119
!      1920
you    1427
.      1323
#      1292
I      1177
to     1063
the     997
dtype: int64

We observe that the most common words are punctuation or stop words

In [17]:
#Let's redo the previous pandas series, but this time taking into account the category of the word (noun, verb,...) and counting 
#only nouns
nltk.download('averaged_perceptron_tagger') #needed to use the pos_tag function, which gives the category
word_cat = [nltk.pos_tag(s) for s in tokens]

nouns = pd.Series()
for sent in word_cat :
    nouns = nouns.append(pd.Series([w[0] for w in sent if "NN" in w[1]])) #NN = nouns 
print(nouns.value_counts()[:10])    #printing our top10 again

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mariabugge/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


@         2454
http       851
D          611
https      330
Thanks     203
day        199
thanks     176
amp        165
i          142
Hi         140
dtype: int64


Better! But still some noisy words. Moving on, we will clean the data. When cleaning textual data, we remove punctuation and lowercase all words

### Cleaning

In [18]:
tokens_no_punct_lower = []
for s in tokens  : 
    tokens_no_punct_lower.append([w.lower() for w in s if w.isalpha()])

In [21]:
tokens_no_punct_lower[2] #what our tweet looks like now

['despiteofficial',
 'we',
 'had',
 'a',
 'listen',
 'last',
 'night',
 'as',
 'you',
 'bleed',
 'is',
 'an',
 'amazing',
 'track',
 'when',
 'are',
 'you',
 'in',
 'scotland']

Next step, removing stopwords. Here, the tweets are in english, so we specify the english language

In [23]:
stop_words = stopwords.words('english')
#as the data is tweets, we will add some twitter-language-based words too, such as 'RT' or 'http'
stop_words.extend(["rt","http"])

In [24]:
#Let's remove the stopwords from our tweets
tokens_wo_stop_word = []
for s in tokens_no_punct_lower  : 
    tokens_wo_stop_word.append([w for w in s if w not in stop_words])

### Stemming (ie considering roots of words instead of whole word)

This allow for the algorithm to consider the singular and plural version to be the same word for instance, and not count everything twice.

In [25]:
stemmed = []
porter = PorterStemmer() #using the PorterStemmer function to stem words. 
for s in tokens_wo_stop_word : 
    stemmed.append([porter.stem(w) for w in s])

In [26]:
#We will redo the pandas series again and see what our top 10 looks like now:
wordcount_Stemmed = pd.Series()
for s in stemmed :
    wordcount_Stemmed = wordcount_Stemmed.append(pd.Series(s))

wordcount_Stemmed.value_counts()[:10]

thank     642
follow    446
love      399
http      336
u         247
day       241
good      238
like      232
happi     209
get       209
dtype: int64

### Word associations

Applying the Word2Vec method on our clean twitter sample. 

In [32]:
b = Word2Vec(tokens_wo_stop_word) #our model

Let's see the top 10 words most likely to be related to 'twitter' 

In [35]:
b.wv.most_similar(positive=["twitter"])

[('like', 0.9992236495018005),
 ('https', 0.9991927146911621),
 ('na', 0.9991474151611328),
 ('time', 0.9991419315338135),
 ('even', 0.9990895986557007),
 ('thanks', 0.9990859031677246),
 ('know', 0.9990695714950562),
 ('get', 0.9990658760070801),
 ('come', 0.9990612864494324),
 ('say', 0.9990571141242981)]

and the top10 least likely

In [36]:
b.wv.most_similar(negative=["twitter"])

[('fback', 0.08321775496006012),
 ('pleasse', -0.30082178115844727),
 ('sore', -0.8252904415130615),
 ('fridays', -0.8295408487319946),
 ('vikkfollows', -0.8992244005203247),
 ('jnlazts', -0.9073284864425659),
 ('mood', -0.9141058921813965),
 ('exactly', -0.9183617234230042),
 ('bae', -0.9233916401863098),
 ('belated', -0.9260453581809998)]