In [1]:
import os
import sys
import re
import time
import nltk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split

In [2]:
dev_path = '../DSL2122_january_dataset/development.csv'
eva_path = '../DSL2122_january_dataset/evaluation.csv'

In [3]:
dev_ds = pd.read_csv(dev_path, low_memory=True)
dev_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224994 entries, 0 to 224993
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   sentiment  224994 non-null  int64 
 1   ids        224994 non-null  int64 
 2   date       224994 non-null  object
 3   flag       224994 non-null  object
 4   user       224994 non-null  object
 5   text       224994 non-null  object
dtypes: int64(2), object(4)
memory usage: 10.3+ MB


In [4]:
dev_ds.drop(columns=['ids', 'flag'], inplace=True)


In [5]:
'''
Hours of day coresponding to the part of the day

morning [5 - 12] -> 0.2
afternoon [13 - 18] -> 0.4
evening [19 - 22] -> 0.6
night [23 - 5] -> 0.8
'''

def hourToPartOfDay(hour):
    if hour < 4:
        return 0.8
    elif hour < 12:
        return 0.2
    elif hour < 18:
        return 0.4
    elif hour < 23:
        return 0.6
    else:
        return 0.8

dev_ds['date'] = pd.to_datetime(dev_ds['date'])
dev_ds['month'] = pd.DatetimeIndex(dev_ds['date']).month
dev_ds['day'] = pd.DatetimeIndex(dev_ds['date']).day
dev_ds['hour'] = pd.DatetimeIndex(dev_ds['date']).hour

dev_ds['part_of_day'] = dev_ds.hour.apply(lambda x: hourToPartOfDay(x))



In [6]:
dev_ds.drop(columns=['date'], inplace=True)

In [7]:
prob_positive = dev_ds.groupby(['user'])['sentiment']

In [8]:
def encode_users(df):
    probs = pd.DataFrame(df.groupby(['user'])['sentiment'].mean())

    # creating ratio between neg and pos
    probs['neg'] = 1 - probs['sentiment']
    probs['ratio'] = probs['sentiment'] / probs['neg']

    # removing infinite values
    probs['ratio'] = probs.ratio.map(lambda x: 80 if x == np.inf else x )

    enc_prob_ratio = probs.ratio.to_dict()

    df['encoded_usr'] = dev_ds['user'].map(enc_prob_ratio)

    return df

In [9]:
dev_ds = encode_users(dev_ds)

# text processing

In [10]:
# cleaning text

remove_tag = lambda x: re.sub(r'[@]\w+[a-zA-Z0-9] ', '',x)
remove_punct = lambda x: re.sub(r'[^\w\s]', '', x)
remove_links = lambda x: re.sub(r'http\w+[a-zA-Z0-9] ', '', x)
remove_strange_chars = lambda x: x.encode("ascii", "ignore").decode()
remove_repeated_chars = lambda x: re.sub(r'(.)\1+', r'\1', x)

dev_ds['cl_text'] = dev_ds.text.map(remove_tag)\
                                    .map(remove_punct)\
                                    .map(remove_links)\
                                    .map(remove_strange_chars)\
                                    .apply(str.lower)

dev_ds['cl_text'] = dev_ds['cl_text'].map(remove_repeated_chars)

In [11]:
# def cleaning_repeating_char(text):
#     return re.sub(r'(.)1+', r'1', text)
#
# dev_ds['cl_text'] = dev_ds['cl_text'].apply(lambda x: cleaning_repeating_char(x))
#

In [12]:
# tokenizing the tweet

tokenize =  lambda x: re.split('\W+', x)

dev_ds['tokenized'] = dev_ds.cl_text.map(tokenize)

most of the tweets contains a last empty char, cleaning again

In [13]:
def clean_tokenized(text): # change name
    text = [w for w in text if len(w) > 1]
    return text
dev_ds['tokenized'] = dev_ds['tokenized'].apply(lambda x: clean_tokenized(x))

In [14]:
stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    if len(text) < 4: return text
    text = [word for word in text if word not in stopword]
    return text

dev_ds['no_stop_words'] = dev_ds.tokenized.apply(lambda x: remove_stopwords(x))

dev_ds['len_tweet'] = dev_ds.no_stop_words.apply(len)

ps = nltk.PorterStemmer()

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

dev_ds['stemmed'] = dev_ds.no_stop_words.apply(lambda x: stemming(x))

In [15]:
dev_ds.drop(columns=['text', 'cl_text', 'user', 'tokenized', 'no_stop_words'], inplace=True)


In [16]:
def dummy(doc):
    return doc

mask = dev_ds['sentiment'] == 1

In [17]:
pos_data = dev_ds[mask].drop(columns=['month', 'day', 'hour', 'part_of_day', 'encoded_usr', 'len_tweet'])
neg_data = dev_ds[~mask].drop(columns=['month', 'day', 'hour', 'part_of_day', 'encoded_usr', 'len_tweet'])

In [18]:
neg_data = neg_data.reset_index()
pos_data = pos_data.reset_index()

pos_data.shape, neg_data.shape

((130157, 3), (94837, 3))

In [52]:
tot_wc = WordCloud(max_words=10000, width = 3980 , height = 2080,
               collocations=False).generate(" ".join(dev_ds['stemmed'].str.join(' ')))

In [20]:
# pos_data = pos_data['stemmed'].str.join(' ')

In [21]:
# wc = WordCloud(max_words = 1000 , width = 3980 , height = 2080,
#                collocations=False).generate(" ".join(pos_data['stemmed'].str.join(' ')))
# plt.imshow(wc)
#
#

In [22]:

# wc = WordCloud(max_words = 1000 , width = 3980 , height = 2080,
#                collocations=False).generate(" ".join(neg_data['stemmed'].str.join(' ')))
# plt.imshow(wc)


In [23]:
tf = TfidfVectorizer(ngram_range=(1,3),
                     min_df=0.0001,
                     tokenizer=dummy,
                     preprocessor=dummy)
pos_tfidf = tf.fit_transform(pos_data['stemmed'])

pos_df_tfidf = pd.DataFrame(pos_tfidf.toarray(), columns=tf.get_feature_names())

pos_df_tfidf.shape

(130157, 1099)

In [24]:
p_data = pos_data[['sentiment']]

In [25]:
p_data.shape, pos_df_tfidf.shape


((130157, 1), (130157, 1099))

In [26]:
pos_df_tfidf.shape

(130157, 1099)

In [27]:
# concatenating the pos values with the tfidf

datasets = [p_data, pos_df_tfidf]
pos_df_tfidf = pd.concat(datasets, axis=1)

pos_df_tfidf.shape
# sys.getsizeof(dev_ds)/1073741824

(130157, 1100)

In [28]:
# del pos_data

In [29]:
# corr = pos_df_tfidf.corr()

# data = dev_ds['stemmed'].str.join(' ')

# plt.figure(figsize = (50,50))

In [30]:
# wc = WordCloud(max_words = 1000 , width = 1980 , height = 1080,
#                collocations=False).generate(" ".join(data))
# plt.imshow(wc)

In [35]:
'''
q(d) = the quality score
d = review
D = the collection of documents
I(t, d) = indicator function whose value is 1 if d contains the term t and -1 otherwise
|D| Pd02D q(d0) = average quality score
'''

def avg_quality_score(df):
    return df['sentiment'].mean()

def indicator(w, t):
    return 1 if w in t else -1


def get_word_corr(w, df, avg, ind_text, ind_sent):
    D = len(df.index)
    ssum = 0


    for el in df.values:
        ind = indicator(w, el[ind_text])
        q_d = el[ind_sent]

        ssum += (ind * (q_d - avg))

    return ssum / D

def word_score_correlation(bag, df):
    cors = {}
    avg_qual = avg_quality_score(df)
    col_text = list(df.columns).index('stemmed')
    col_sent = list(df.columns).index('sentiment')

    for w in bag:
        value = get_word_corr(w, df, avg_qual, col_text, col_sent)
        cors[w] = value

    return cors

In [53]:
ls = list(tot_wc.words_)

In [54]:
wsc = word_score_correlation(ls, dev_ds)

In [55]:
wsc = {k: v for k, v in sorted(wsc.items(), key=lambda item: item[1], reverse=True)}

In [56]:
words = list(wsc.items())

In [59]:
pos_words = words[0:2000]
neg_words = words[-2000:]

In [61]:
tf = TfidfVectorizer(ngram_range=(1,3),
                     min_df=0.0001,
                     tokenizer=dummy,
                     preprocessor=dummy)
pos_tfidf = tf.fit_transform(dev_ds['stemmed'])

full_pos_df_tfidf = pd.DataFrame(pos_tfidf.toarray(), columns=tf.get_feature_names())

full_pos_df_tfidf.shape

(224994, 9703)

In [62]:
def fin_df_tr(pos, neg, df):

    for col in df.columns:
        if col == 'sentiment':
            continue
        if (col not in pos) and (col not in neg):
            df.drop(columns=[col], inplace=True)

    return df

In [63]:
sent_df = dev_ds[['sentiment']]

datasets = [sent_df, full_pos_df_tfidf]
full_df_tr = pd.concat(datasets, axis=1)

In [64]:
ls_1 = full_df_tr.columns

In [65]:
ls_1 = list(ls_1)