In [1]:
# Loading the data
import pandas as pd
from typing import List, Callable
from multiprocess.pool import Pool
import string
from tqdm import tqdm

# List of tokens that donot hold much importance.
# These stopwords are part of NLTK library but we 
# are hardcoding them to avoid unnecessary dependency on
# NLTK library
MAX_WORKERS = 20
STOP_WORDS = {'call', 'upon', 'still', 'nevertheless', 'down', 'every', 'forty', '‘re', 'always', 'whole', 'side', "n't", 'now', 'however', 'an', 'show', 'least', 'give', 'below', 'did', 'sometimes', 'which', "'s", 'nowhere', 'per', 'hereupon', 'yours', 'she', 'moreover', 'eight', 'somewhere', 'within', 'whereby', 'few', 'has', 'so', 'have', 'for', 'noone', 'top', 'were', 'those', 'thence', 'eleven', 'after', 'no', '’ll', 'others', 'ourselves', 'themselves', 'though', 'that', 'nor', 'just', '’s', 'before', 'had', 'toward', 'another', 'should', 'herself', 'and', 'these', 'such', 'elsewhere', 'further', 'next', 'indeed', 'bottom', 'anyone', 'his', 'each', 'then', 'both', 'became', 'third', 'whom', '‘ve', 'mine', 'take', 'many', 'anywhere', 'to', 'well', 'thereafter', 'besides', 'almost', 'front', 'fifteen', 'towards', 'none', 'be', 'herein', 'two', 'using', 'whatever', 'please', 'perhaps', 'full', 'ca', 'we', 'latterly', 'here', 'therefore', 'us', 'how', 'was', 'made', 'the', 'or', 'may', '’re', 'namely', "'ve", 'anyway', 'amongst', 'used', 'ever', 'of', 'there', 'than', 'why', 'really', 'whither', 'in', 'only', 'wherein', 'last', 'under', 'own', 'therein', 'go', 'seems', '‘m', 'wherever', 'either', 'someone', 'up', 'doing', 'on', 'rather', 'ours', 'again', 'same', 'over', '‘s', 'latter', 'during', 'done', "'re", 'put', "'m", 'much', 'neither', 'among', 'seemed', 'into', 'once', 'my', 'otherwise', 'part', 'everywhere', 'never', 'myself', 'must', 'will', 'am', 'can', 'else', 'although', 'as', 'beyond', 'are', 'too', 'becomes', 'does', 'a', 'everyone', 'but', 'some', 'regarding', '‘ll', 'against', 'throughout', 'yourselves', 'him', "'d", 'it', 'himself', 'whether', 'move', '’m', 'hereafter', 're', 'while', 'whoever', 'your', 'first', 'amount', 'twelve', 'serious', 'other', 'any', 'off', 'seeming', 'four', 'itself', 'nothing', 'beforehand', 'make', 'out', 'very', 'already', 'various', 'until', 'hers', 'they', 'not', 'them', 'where', 'would', 'since', 'everything', 'at', 'together', 'yet', 'more', 'six', 'back', 'with', 'thereupon', 'becoming', 'around', 'due', 'keep', 'somehow', 'n‘t', 'across', 'all', 'when', 'i', 'empty', 'nine', 'five', 'get', 'see', 'been', 'name', 'between', 'hence', 'ten', 'several', 'from', 'whereupon', 'through', 'hereby', "'ll", 'alone', 'something', 'formerly', 'without', 'above', 'onto', 'except', 'enough', 'become', 'behind', '’d', 'its', 'most', 'n’t', 'might', 'whereas', 'anything', 'if', 'her', 'via', 'fifty', 'is', 'thereby', 'twenty', 'often', 'whereafter', 'their', 'also', 'anyhow', 'cannot', 'our', 'could', 'because', 'who', 'beside', 'by', 'whence', 'being', 'meanwhile', 'this', 'afterwards', 'whenever', 'mostly', 'what', 'one', 'nobody', 'seem', 'less', 'do', '‘d', 'say', 'thus', 'unless', 'along', 'yourself', 'former', 'thru', 'he', 'hundred', 'three', 'sixty', 'me', 'sometime', 'whose', 'you', 'quite', '’ve', 'about', 'even'}

# Dataset downloaded from Kaggle.
SENTIMENTS = '/Users/hope/Downloads/test.csv'
df = pd.read_csv(SENTIMENTS, encoding='ISO-8859-1')

In [2]:
# Utility functions

def split_df(df: pd.DataFrame, chunk_size: int = 2000) -> List[pd.DataFrame]:
    """
    Function to split a pandas DF/series into chunks
    """
    index = 0
    splits = []
    while index < len(df.index):
        splits.append(df.iloc[index:index+chunk_size])
        index += chunk_size
    return splits



def parallel_apply(col: pd.Series, func: Callable) -> pd.Series:
    """
    Utility function to apply a function to all the entries
    of a series. 

    Internal series.apply works sequentially. This function replicates
    the behavior but runs parallaly on all the rows.
    """
    
    splits = split_df(col, 100)
    with Pool(MAX_WORKERS) as pool:
        results = []
        for split in splits:
            task = pool.apply_async(lambda x: x.apply(func), (split, ))
            results.append(task)
        parsed_splits = [future.get() for future in tqdm(results)]
    return pd.concat(parsed_splits)


def clean_text(doc: str) -> List[str]:
    """
    Returns a list of clean tokens for a given line.
    """
    
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = STOP_WORDS
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word.lower() for word in tokens if len(word) > 1]
    return tokens


In [3]:
# Cleaning the data
# Cleaning without multiprocessing as this is not a bottle neck

# 1. We only care about the text and corresponding sentiments
df = df[["text", "sentiment"]]

# 2. Dropping the bad rows
df = df[~df.sentiment.isna()]


# Cleaning with multiprocessing for bottlenecks
# 3. Creating a cleaned text column parallely. This improves the runtime
df["cleaned_text"] = parallel_apply(df.text, clean_text)
df

100%|███████████████████████████████████████| 36/36 [00:00<00:00, 274536.26it/s]


Unnamed: 0,text,sentiment,cleaned_text
0,Last session of the day http://twitpic.com/67ezh,neutral,"[last, session, day]"
1,Shanghai is also really exciting (precisely -...,positive,"[shanghai, exciting, precisely, skyscrapers, g..."
2,"Recession hit Veronique Branquinho, she has to...",negative,"[recession, hit, veronique, branquinho, quit, ..."
3,happy bday!,positive,"[happy, bday]"
4,http://twitpic.com/4w75p - I like it!!,positive,[like]
...,...,...,...
3529,"its at 3 am, im very tired but i can`t sleep ...",negative,"[im, tired, cant, sleep, try]"
3530,All alone in this old house again. Thanks for...,positive,"[all, old, house, thanks, net, keeps, alive, k..."
3531,I know what you mean. My little dog is sinkin...,negative,"[know, mean, my, little, dog, sinking, depress..."
3532,_sutra what is your next youtube video gonna b...,positive,"[sutra, youtube, video, gonna, love, videos]"


In [4]:
# Feature engineering
# Will be done parallely
# Heavy task
# - Create a vector matrix for each token and for each text.

# Step 1: Extracting all distinct tokens. Fast and doesnot require
# parallisation. Infact parallisation will reduce the overall performance
ALL_TOKENS = df.cleaned_text.sum()

def get_top_tokens(token_list, num_tokens = 3000):
    mp = {}
    for token in token_list:
        mp[token] = mp.get(token, 0) + 1
    mp = sorted(mp.items(), key=lambda item: item[1], reverse=True)
    top_tokens = []
    for token, freq in mp:
        top_tokens.append(token)
    return top_tokens[:500]

# Step 2: Creating feature matrix. Heavy task. As this will create a 2d space
# of Number of sentence x number of total tokens. will do using parallisation
ALL_TOKENS = get_top_tokens(ALL_TOKENS)
def create_vector_array(tokens):
    vector = []
    for token in ALL_TOKENS:
        if token in tokens:
            vector.append(1)
        else:
            vector.append(0)
    return vector

df["features"] = parallel_apply(df.cleaned_text, create_vector_array)
df["values"] = parallel_apply(df.sentiment, lambda value: 1 if value == "positive" else 0)

100%|██████████████████████████████████████████| 36/36 [00:00<00:00, 452.39it/s]
100%|███████████████████████████████████████| 36/36 [00:00<00:00, 653657.77it/s]


In [5]:
# 3. Dropping all the neutral sentiments as these will cause 
# issues while training a logistic regresssion. Logistic regressions
# need binary classifications. positive/negative works good with it. 
# positive can be treated as 1 and negative as 0. 
# The predictions will be a real number between 0 - 1. The closer the
# value to 1. The more positive the sentiment
train_df = df[df.sentiment != "neutral"]

In [6]:
features = train_df.features

In [7]:
values = train_df["values"].to_list()

In [8]:
# Training the model
# We have explored the concept of bagging where multiple models are generated parallely.
# Result is Average of all the models. It's supposed to give a more generalised result.

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression

class BaggedRegressions:
    def __init__(self, num_estimators: int):
        self.num_estimators = num_estimators

    def train(self, features, values):
        tol = 0.0001
        delta_tol = 0.0001
        with Pool(MAX_WORKERS) as pool:
            tasks = []
            for index in range(self.num_estimators):
                tasks.append(pool.apply_async(lambda tol: LogisticRegression(tol=tol).fit(features.to_list(), values), (tol, )))
                tol += delta_tol
            self.models = []
            current_task = 1
            for task in tasks:
                self.models.append(task.get())
                print(f"Got {current_task} / {self.num_estimators} models")
                current_task += 1

    def predict_probablity(self, feature):
        prob = 0
        for model in self.models:
            prob += model.predict_proba([feature])[0][1]
        return prob / self.num_estimators

In [9]:
bagged_regressions = BaggedRegressions(20)
bagged_regressions.train(features, values)

Got 1 / 20 models
Got 2 / 20 models
Got 3 / 20 models
Got 4 / 20 models
Got 5 / 20 models
Got 6 / 20 models
Got 7 / 20 models
Got 8 / 20 models
Got 9 / 20 models
Got 10 / 20 models
Got 11 / 20 models
Got 12 / 20 models
Got 13 / 20 models
Got 14 / 20 models
Got 15 / 20 models
Got 16 / 20 models
Got 17 / 20 models
Got 18 / 20 models
Got 19 / 20 models
Got 20 / 20 models


In [10]:
bagged_regressions.predict_probablity(df.features.iloc[0])

0.5027905455183762

In [11]:
import numpy as np

df["prediction_prob"] = parallel_apply(df.features, lambda x: bagged_regressions.predict_probablity(x))

100%|███████████████████████████████████████████| 36/36 [00:00<00:00, 41.20it/s]


In [12]:
df.head(20)

Unnamed: 0,text,sentiment,cleaned_text,features,values,prediction_prob
0,Last session of the day http://twitpic.com/67ezh,neutral,"[last, session, day]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0.502791
1,Shanghai is also really exciting (precisely -...,positive,"[shanghai, exciting, precisely, skyscrapers, g...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0.830085
2,"Recession hit Veronique Branquinho, she has to...",negative,"[recession, hit, veronique, branquinho, quit, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0.296053
3,happy bday!,positive,"[happy, bday]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1,0.831863
4,http://twitpic.com/4w75p - I like it!!,positive,[like],"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0.539352
5,that`s great!! weee!! visitors!,positive,"[thats, great, weee, visitors]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0.91436
6,I THINK EVERYONE HATES ME ON HERE lol,negative,"[think, everyone, hates, me, on, here, lol]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",0,0.223599
7,"soooooo wish i could, but im in school and my...",negative,"[soooooo, wish, im, school, myspace, completel...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0.572136
8,and within a short time of the last clue all ...,neutral,"[short, time, clue]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",0,0.681904
9,What did you get? My day is alright.. haven`...,neutral,"[what, my, day, alright, havent, leaving, soon...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0.331292


In [13]:
def text_to_feature(text: str):
    text = clean_text(text)
    return create_vector_array(text)

In [14]:
bagged_regressions.predict_probablity(text_to_feature("I am a very good person. Today I am happy"))

0.9504756933577841

In [15]:
bagged_regressions.predict_probablity(text_to_feature("I am a very sad person. I hate my life"))

0.0029226843225160703

In [16]:
bagged_regressions.predict_probablity(text_to_feature("Today was a decent day"))

0.6614674063108554

In [17]:
bagged_regressions.predict_probablity(text_to_feature("Life is what it is"))

0.4371464121568714

In [18]:
bagged_regressions.predict_probablity(text_to_feature("College life is fun"))

0.7910127779398521

In [19]:
bagged_regressions.predict_probablity(text_to_feature("I saved a life today. It made me happy"))

0.8354645736628861