In [1]:
import heapq
import os
import string
from glob import glob

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm

nltk.download('stopwords')
nltk.download('punkt')

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nvs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nvs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Global matplotlib look: grayscale style sheet, white figure background, 1000 dpi.
# NOTE(review): 1000 dpi makes every figure very expensive to render and save -
# confirm this resolution is really intended.
plt.style.use('grayscale')
plt.rcParams.update({
    'figure.facecolor': 'white',
    'figure.dpi': 1000,
})

In [3]:
def stemmer(tokens):
    """Return the Porter stem of every token, in the original order.

    Args:
        tokens: iterable of word strings.

    Returns:
        A new list with one stemmed string per input token.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

In [4]:
def get_train_data(data, sentiment, random_state=None):
    """Build a class-balanced one-vs-rest training frame for ``sentiment``.

    All rows of ``data`` whose ``label`` equals ``sentiment`` are kept, and an
    equally sized random sample of the remaining rows is appended.

    Args:
        data: DataFrame with a ``label`` column.
        sentiment: label value treated as the positive class.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            negative sample can be reproduced. ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        DataFrame of positives followed by sampled negatives, with a fresh
        0..n-1 index.
    """
    # Build the masks from `data` itself rather than from a notebook global,
    # so the function works on any frame that is passed in.
    positives = data[data['label'] == sentiment]
    negatives = data[data['label'] != sentiment]

    # sample() raises when asked for more rows than exist; cap the request so
    # a majority class still yields a usable (if unbalanced) frame.
    n_negatives = min(len(positives), len(negatives))
    negatives = negatives.sample(n_negatives, random_state=random_state)

    return pd.concat([positives, negatives], ignore_index=True)

In [5]:
def filter_comment(comment):
    """Normalise a raw comment to lowercase ASCII letters and spaces.

    Leading/trailing line breaks are dropped, interior line breaks and tabs
    become a single space each (so the words on either side are not fused
    together), the text is lowercased, and every character other than
    ``a``-``z`` or a space is removed. Punctuation and digits therefore
    disappear without a separate pass.

    Args:
        comment: raw text of one comment.

    Returns:
        The cleaned string; may be empty.
    """
    allowed = "abcdefghijklmnopqrstuvwxyz "

    # Trim line breaks at the edges first so a plain single-line comment
    # gives the same result as before.
    text = comment.strip('\r\n').lower()

    # Interior separators must survive as spaces: "good\nbad" -> "good bad".
    for separator in ('\r\n', '\n', '\r', '\t'):
        text = text.replace(separator, ' ')

    return "".join(char for char in text if char in allowed)

In [6]:
def tokenize(filtered_comment):
    """Split text into NLTK word tokens and drop English stop words.

    Args:
        filtered_comment: text to tokenize.

    Returns:
        List of tokens that are not in the module-level ``stop_words`` list
        (the comparison is exact, so it is case-sensitive).
    """
    kept = []
    for word in nltk.word_tokenize(filtered_comment):
        if word in stop_words:
            continue
        kept.append(word)
    return kept

In [7]:
# glob() returns paths in filesystem order, which is not guaranteed to be
# stable; sorting keeps the downstream vocabulary/column order reproducible.
all_files = sorted(glob('sentiment data/data/*.txt'))

In [8]:
# Tokenize the raw comment lines of every file. filter_comment() and
# stemmer() are NOT applied in this pass; only tokenize() is.
# Result: {file path: [token list per kept comment]}; a file with no kept
# comments gets no entry.
# Lines containing any of these substrings are skipped - presumably scraped
# page chrome (timestamps, awards, "load more" links, URLs).
# NOTE(review): these are plain substring checks, so e.g. 'ago' also drops
# any comment that merely contains a word with "ago" in it - confirm intended.
skip_markers = ('ago', 'award', 'carregar mais', 'permalinkembed', 'http')

filtered_data_dict = {}
for f in tqdm(all_files):
    # Context manager so each file handle is closed as soon as it is read.
    with open(f, 'r', encoding='utf-8') as comment_file:
        comments = comment_file.readlines()

    for comment in comments:
        if any(marker in comment for marker in skip_markers):
            continue

        # Tokenize once; the original tokenized the same line twice.
        tokens = tokenize(comment)

        if len(tokens) > 0:
            filtered_data_dict.setdefault(f, []).append(tokens)

100%|██████████| 50/50 [00:05<00:00,  8.48it/s]


In [9]:
# Count how many times each token occurs across all files.
wordfreq = {}
for sentences in tqdm(filtered_data_dict.values()):
    for sentence in sentences:
        for token in sentence:
            wordfreq[token] = wordfreq.get(token, 0) + 1

100%|██████████| 50/50 [00:00<00:00, 1163.44it/s]


In [10]:
# Vocabulary size (number of distinct tokens counted so far).
len(wordfreq)

13918

In [11]:
# Clean, tokenize and stem the comment lines of every file.
# Result: {file path: [stemmed token list per kept comment]}; a file with no
# kept comments gets no entry. This rebinds filtered_data_dict.
# Lines containing any of these substrings are skipped - presumably scraped
# page chrome (timestamps, awards, "load more" links, URLs).
# NOTE(review): these are plain substring checks, so e.g. 'ago' also drops
# any comment that merely contains a word with "ago" in it - confirm intended.
skip_markers = ('ago', 'award', 'carregar mais', 'permalinkembed', 'http')

filtered_data_dict = {}
for f in tqdm(all_files):
    # Context manager so each file handle is closed as soon as it is read.
    with open(f, 'r', encoding='utf-8') as comment_file:
        comments = comment_file.readlines()

    for comment in comments:
        if any(marker in comment for marker in skip_markers):
            continue

        filtered_comment = filter_comment(comment)
        tokens = tokenize(filtered_comment)
        stemmed = stemmer(tokens)

        # Comments that are empty after cleaning and stop-word removal are dropped.
        if len(tokens) > 0:
            filtered_data_dict.setdefault(f, []).append(stemmed)

100%|██████████| 50/50 [00:04<00:00, 10.50it/s]


In [12]:
# Recount token occurrences, now over the stemmed comments.
wordfreq = {}
for sentences in tqdm(filtered_data_dict.values()):
    for sentence in sentences:
        for token in sentence:
            wordfreq[token] = wordfreq.get(token, 0) + 1

100%|██████████| 50/50 [00:00<00:00, 1786.74it/s]


In [13]:
# Vocabulary size after cleaning and stemming.
len(wordfreq)

8117

In [14]:
# Keep only tokens seen more than 20 times; they form the feature vocabulary.
most_freq = {
    word: count
    for word, count in tqdm(wordfreq.items())
    if count > 20
}

100%|██████████| 8117/8117 [00:00<00:00, 1351856.96it/s]


In [15]:
# Number of tokens that passed the frequency threshold.
len(most_freq)

783

In [16]:
# Replace each file's list of stemmed comments with a binary bag-of-words
# DataFrame: one row per comment, one 0/1 column per word in most_freq.
# CAUTION: this overwrites filtered_data_dict in place, so the cell is not
# re-runnable on its own - rebuild filtered_data_dict before running it again.
vocabulary = list(most_freq.keys())  # built once; also fixes the column order

for f in tqdm(filtered_data_dict.keys()):
    sentence_vectors = []
    for sentence in filtered_data_dict[f]:
        # Set lookup instead of scanning the token list once per vocabulary word.
        present = set(sentence)
        sentence_vectors.append([1 if token in present else 0 for token in vocabulary])
    filtered_data_dict[f] = pd.DataFrame(sentence_vectors, columns=vocabulary)

100%|██████████| 50/50 [00:06<00:00,  8.23it/s]


In [17]:
# Load the labelled sentiment dataset; its `content` and `sentiment` columns
# are read in the following cells.
sentiment_csv_path = 'sentiment data/crowdflower-sentiment-analysis-in-text/data/text_emotion.csv'
sent_df = pd.read_csv(sentiment_csv_path)

In [18]:
# Encode every labelled row of sent_df with the same binary bag-of-words
# vocabulary (most_freq) and append its sentiment as a final 'label' column.
# NOTE(review): if the vocabulary itself ever contains the token 'label',
# the frame would get two columns with that name - verify.
vocabulary = list(most_freq.keys())

sentence_vectors_sentiment = []
# zip over the two columns avoids the per-row Series construction of iterrows().
for content, label in tqdm(zip(sent_df.content, sent_df.sentiment), total=len(sent_df)):
    filtered_comment = filter_comment(content)
    tokens = tokenize(filtered_comment)
    # Set lookup instead of scanning the token list once per vocabulary word.
    stemmed = set(stemmer(tokens))

    sent_vec = [1 if token in stemmed else 0 for token in vocabulary]
    sentence_vectors_sentiment.append(sent_vec + [label])

sentiment_df = pd.DataFrame(sentence_vectors_sentiment, columns=vocabulary + ['label'])

100%|██████████| 40000/40000 [00:32<00:00, 1239.46it/s]


In [19]:
# Count columns with at least one non-zero entry (True) versus columns that
# are zero everywhere (False). The 'label' column is part of this tally.
sentiment_df.ne(0).any(axis=0).value_counts()

True     708
False     76
dtype: int64

In [20]:
# Keep only vocabulary words that occur in at least one labelled row; the
# 'label' column is excluded because it is the target, not a feature.
# Series.items() is used because Series.iteritems() no longer exists in
# pandas 2.x.
features = [
    word
    for word, appears in (sentiment_df != 0).any(axis=0).items()
    if appears and word != 'label'
]

In [21]:
# Distinct sentiment labels, in order of first appearance in sent_df.
sentiments = sent_df['sentiment'].unique()

In [22]:
# Show the distinct sentiment labels as a plain Python list.
list(sentiments)

['empty',
 'sadness',
 'enthusiasm',
 'neutral',
 'worry',
 'surprise',
 'love',
 'fun',
 'hate',
 'happiness',
 'boredom',
 'relief',
 'anger']

In [23]:
# Train one binary (one-vs-rest) MLP per sentiment on a class-balanced sample.
# Both random steps are seeded so a re-run reproduces the same models:
# get_train_data draws its negatives with DataFrame.sample(), which uses
# NumPy's global RNG when no seed is passed, and MLPClassifier takes its own
# random_state for weight initialisation and shuffling.
SEED = 42
np.random.seed(SEED)

# The feature/label slice does not depend on the sentiment, so build it once.
training_frame = sentiment_df[features + ['label']]

classifiers = {}
for sentiment in tqdm(sentiments):
    train_data = get_train_data(training_frame, sentiment)
    model = MLPClassifier(hidden_layer_sizes=[400, 300], random_state=SEED)
    # Target is True for rows of this sentiment, False for the sampled rest.
    model.fit(train_data.drop('label', axis=1), train_data.label == sentiment)
    classifiers[sentiment] = model

100%|██████████| 13/13 [18:43<00:00, 86.44s/it]


In [24]:
# Score every file with each sentiment classifier. The stored value is the
# fraction of that file's comments the classifier labels as positive
# (mean of the hard predictions).
sentiment_results = []
for f in tqdm(filtered_data_dict.keys()):
    comment_vectors = filtered_data_dict[f]
    if len(comment_vectors) == 0:
        continue

    # "<player>_<club>.txt" -> "<player> to <club>". basename/splitext strip
    # the directory and extension regardless of the OS path separator.
    player = os.path.splitext(os.path.basename(f))[0].replace('_', ' to ')

    # Select the feature columns once per file, not once per classifier.
    X = comment_vectors[features]

    player_results = [player]
    for sentiment in sentiments:
        # Only predict() is needed; the unused predict_proba() call was
        # dropped because it repeated the forward pass for nothing.
        pred = classifiers[sentiment].predict(X)
        player_results.append(pred.mean())
    sentiment_results.append(player_results)

columns = ['player'] + list(sentiments)

sentiment_results_df = pd.DataFrame(sentiment_results, columns=columns)

100%|██████████| 50/50 [00:19<00:00,  2.52it/s]


# Draw one bar chart per player and save it as a PNG in the output folder.
output_dir = 'sentiment data/output'
os.makedirs(output_dir, exist_ok=True)  # savefig does not create directories

positions = list(range(len(sentiments)))
for _, row in sentiment_results_df.iterrows():
    # Strip any directory part that may still be attached to the name.
    player = os.path.basename(row.player)

    # A fresh figure per player: drawing on the implicit current figure made
    # every chart also contain the bars of all players plotted before it.
    fig, ax = plt.subplots()
    ax.bar(positions, [row[sentiment] for sentiment in sentiments], width=0.2, color='C0')

    ax.set_title(player)
    ax.set_xticks(positions)
    ax.set_xticklabels(sentiments, rotation=45)

    # Explicit ".png" so the format never depends on a dot inside the name.
    fig.savefig(os.path.join(output_dir, player + '.png'))
    plt.close(fig)  # release the figure; dozens of open figures exhaust memory

In [26]:
# Write the per-player scores to CSV (row index included), then preview them.
results_csv_path = 'sentiment data/sentiment_results.csv'
sentiment_results_df.to_csv(results_csv_path)
sentiment_results_df.head()

Unnamed: 0,player,empty,sadness,enthusiasm,neutral,worry,surprise,love,fun,hate,happiness,boredom,relief,anger
0,Alexandre Lacazette to Arsenal FC,0.594595,0.301802,0.436937,0.630631,0.378378,0.396396,0.31982,0.40991,0.599099,0.306306,0.445946,0.337838,0.626126
1,Alvaro Morata to Atletico Madrid,0.541667,0.337963,0.537037,0.425926,0.467593,0.449074,0.361111,0.513889,0.592593,0.356481,0.412037,0.402778,0.583333
2,Alvaro Morata to Chelsea FC,0.463203,0.350649,0.52381,0.5671,0.4329,0.515152,0.385281,0.502165,0.480519,0.350649,0.489177,0.467532,0.458874
3,Angel Di Maria to Manchester United,0.5,0.466667,0.514286,0.509524,0.438095,0.47619,0.295238,0.5,0.561905,0.342857,0.371429,0.4,0.585714
4,Angel Di Maria to Paris Saint-Germain,0.517241,0.46798,0.502463,0.448276,0.497537,0.492611,0.300493,0.482759,0.62069,0.310345,0.512315,0.482759,0.625616
