In [40]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from IPython import display
import math
from pprint import pprint
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
# Fetch the NLTK data packages: the VADER lexicon, the punkt tokenizer models
# and the stop-word lists. Each call reports when the package is already present.
for resource in ('vader_lexicon', 'punkt', 'stopwords'):
    nltk.download(resource)

# Global seaborn theme applied to the figures drawn in this notebook.
sns.set(style='darkgrid', context='talk', palette='Dark2')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mk73680\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mk73680\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mk73680\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
# Load the dataset from a CSV file given by a relative path.
df = pd.read_csv('dataset.csv')
# Plain-text preview of the first five rows.
pprint(df.head())
# Print the full value at row 0, column position 2 (the 'Body' column in the
# preview shown below).
print(df.iloc[0,2])

   antwort_id                                              Title  \
0     8273117                                                NaN   
1    33446954  Angular JSON Display Single Result without Ng-...   
2    25183554                                                NaN   
3    25194037                                                NaN   
4    16695595                        Twitter iOS 6 oauth or not?   

                                                Body  Score  \
0  <p>You would have to custom code the second pa...      1   
1  <p>I'm still develop app for show news update ...      0   
2  <p>To get the user's information from Twitter,...      2   
3  <p>The Play JSON framework has <a href="http:/...      4   
4  <p>Am i correct in saying that you do not need...      0   

          CreationDate    parentid  Post Link  
0  2011-11-25 18:42:22   8268229.0    8273117  
1  2015-10-31 01:01:22         NaN   33446954  
2  2014-08-07 13:14:55  25178919.0   25183554  
3  2014-08-07 23:47:

In [None]:
# Score every text in column position 2 of `df` with VADER and collect one
# dict of polarity scores per text; the text itself is stored under 'headline'.
# NOTE(review): a later cell rebinds `df` to the results frame, so this cell
# must be run against the frame loaded from the CSV.
sia = SIA()
results = []

# Iterate over the column's values directly: the labels yielded by iterrows()
# are index labels, not positions, so feeding them to .iloc is only correct
# for a default RangeIndex. Missing values are dropped because
# polarity_scores() cannot process NaN.
for text in df.iloc[:, 2].dropna():
    pol_score = sia.polarity_scores(text)
    pol_score['headline'] = text
    results.append(pol_score)

pprint(results[:3], width=100)

In [None]:
# Build a DataFrame with one row per entry of `results` (the list of score dicts).
# NOTE(review): this rebinds `df`, replacing the frame loaded from the CSV. The
# scoring cell above reads column position 2 of `df`, so re-running it after
# this cell would read from this frame instead — consider a distinct name.
df = pd.DataFrame.from_records(results)
df.head()

In [None]:
# Label each text from its 'compound' score: above POS_THRESHOLD -> 1,
# below NEG_THRESHOLD -> -1, everything in between keeps the default 0.
POS_THRESHOLD = 0.2
NEG_THRESHOLD = -0.2

df['label'] = 0
df.loc[df['compound'] > POS_THRESHOLD, 'label'] = 1
df.loc[df['compound'] < NEG_THRESHOLD, 'label'] = -1

# A bare `df.head()` in the middle of a cell renders nothing, so show the
# preview explicitly. `display` here is the IPython.display module imported
# at the top of the notebook.
display.display(df.head())

print("Positive headlines:\n")
pprint(list(df[df['label'] == 1].headline)[:5], width=200)

print("\nNegative headlines:\n")
pprint(list(df[df['label'] == -1].headline)[:5], width=200)

# Absolute and percentage share of each label.
print(df.label.value_counts())
print(df.label.value_counts(normalize=True) * 100)

In [None]:
# Bar chart of the percentage of texts per sentiment label.
fig, ax = plt.subplots(figsize=(8, 8))

# Fix the bar order explicitly and fill absent classes with 0 so the three
# hard-coded tick labels below always line up with the right bars, even when
# one of the labels does not occur in the data.
LABEL_ORDER = [-1, 0, 1]
counts = (df.label.value_counts(normalize=True) * 100).reindex(LABEL_ORDER, fill_value=0)

sns.barplot(x=counts.index, y=counts.values, order=LABEL_ORDER, ax=ax)

ax.set_xticklabels(['Negative', 'Neutral', 'Positive'])
ax.set_ylabel("Percentage")

plt.show()
    

In [None]:
# English stop words, used by process_text() to filter tokens.
stop_words = stopwords.words('english')
print(stop_words[:20])

# Tokenizer that keeps runs of word characters and drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')

# Small demonstration of the tokenizer. `example` was previously undefined,
# which raised NameError on a fresh kernel, and the result was never shown.
example = "This is an example sentence! However, it isn't a very informative one"
print(tokenizer.tokenize(example))
    
def process_text(headlines):
    """Tokenize texts and return their lower-cased, non-stop-word tokens.

    Uses the module-level ``tokenizer`` to split each text and the
    module-level ``stop_words`` to filter the result.

    Parameters
    ----------
    headlines : iterable of str
        Texts to tokenize.

    Returns
    -------
    list of str
        All remaining tokens from all texts, in order of appearance
        (duplicates are kept so they can be counted afterwards).
    """
    # Build a set once per call: membership tests against the stop-word list
    # would otherwise scan the whole list for every single token.
    stop_set = set(stop_words)

    tokens = []
    for line in headlines:
        # Lower-case each token once, then drop the stop words.
        lowered = (tok.lower() for tok in tokenizer.tokenize(line))
        tokens.extend(tok for tok in lowered if tok not in stop_set)

    return tokens

In [None]:
# Most frequent words among the texts labelled positive (label == 1).
is_positive = df['label'] == 1
pos_lines = df.loc[is_positive, 'headline'].tolist()

# Flatten the texts into filtered tokens, then count occurrences per token.
pos_tokens = process_text(pos_lines)
pos_freq = nltk.FreqDist(pos_tokens)

pos_freq.most_common(20)
    

In [None]:
# Frequency of each positive word, ordered from most to least common.
# The resulting curve is expected to resemble a power-law distribution.
y_val = [count for _word, count in pos_freq.most_common()]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_val)

ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
ax.set_title("Word Frequency Distribution (Positive)")
plt.show()

In [None]:
# Log-log view of the positive word frequencies: a small number of words
# accounts for most of the occurrences.
# Counts are summed over consecutive groups of four ranks before taking the
# log; a trailing group with fewer than four entries is left out.
GROUP = 4
y_final = [
    math.log(sum(y_val[start:start + GROUP]))
    for start in range(0, len(y_val) - (GROUP - 1), GROUP)
]

# Log of the 1-based group rank.
x_val = [math.log(rank) for rank in range(1, len(y_final) + 1)]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(x_val, y_final)

ax.set_xlabel("Words (Log)")
ax.set_ylabel("Frequency (Log)")
ax.set_title("Word Frequency Distribution (Positive)")
plt.show()
    

In [None]:
# Same steps as for the positive texts, now for those labelled negative
# (label == -1).
is_negative = df['label'] == -1
neg_lines = df.loc[is_negative, 'headline'].tolist()

# Flatten the texts into filtered tokens, then count occurrences per token.
neg_tokens = process_text(neg_lines)
neg_freq = nltk.FreqDist(neg_tokens)

neg_freq.most_common(20)
    

In [None]:
# Frequency of each negative word, ordered from most to least common.
# NOTE: this rebinds `y_val`, which the positive plots also use.
y_val = [count for _word, count in neg_freq.most_common()]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_val)

ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
ax.set_title("Word Frequency Distribution (Negative)")
plt.show()

In [None]:
# Log-log view of the negative word frequencies.
# Counts are summed over consecutive groups of three ranks before taking the
# log; a trailing group with fewer than three entries is left out.
GROUP = 3
y_final = []
for start in range(0, len(y_val) - (GROUP - 1), GROUP):
    group_total = sum(y_val[start:start + GROUP])
    # log(0) is undefined, so stop at the first group that sums to zero.
    if group_total == 0:
        break
    y_final.append(math.log(group_total))

# Log of the 1-based group rank.
x_val = [math.log(rank) for rank in range(1, len(y_final) + 1)]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(x_val, y_final)

ax.set_xlabel("Words (Log)")
ax.set_ylabel("Frequency (Log)")
ax.set_title("Word Frequency Distribution (Negative)")
plt.show()