In [65]:
import pandas as pd
import re
import numpy as np
from scipy.sparse import dok_matrix
from sklearn.svm import SVC
import os
import joblib

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session. This silences the warning everywhere, not only where it is expected.
pd.options.mode.chained_assignment = None  # default='warn'


def preprocess_tweet(text):
    """Turn a raw tweet into a list of lowercase alphanumeric tokens.

    Everything from ``http`` up to the next whitespace is removed first. The
    remainder is lowercased and split on every run of characters other than
    ASCII letters and digits. Empty fragments (left over when the text starts
    or ends with a separator) are dropped.
    """
    without_links = re.sub(r"http\S+", "", text)
    fragments = re.split(r"[^a-zA-Z0-9]+", without_links.lower())
    tokens = []
    for fragment in fragments:
        if fragment:
            tokens.append(fragment)
    return tokens

In [66]:
# Load the labelled tweets (tab-delimited) and shuffle all rows with a fixed
# seed so the split below is the same on every run.
tweets_df = pd.read_csv("train.txt", delimiter="\t")
tweets_df = tweets_df.sample(frac=1, random_state=0)

# The first 90 % of the shuffled rows form the training split, the rest the
# test split.
train_size = int(len(tweets_df) * 0.9)
test_size = len(tweets_df) - train_size

# Take explicit copies: later cells add columns to both frames, and assigning
# into a slice of `tweets_df` is what raises SettingWithCopyWarning.
tweets_train_df = tweets_df.head(train_size).copy()
tweets_test_df = tweets_df.tail(test_size).copy()

In [67]:
# Tokenise every tweet in both splits.
tweets_train_df["tokenized_tweet"] = tweets_train_df["tweet"].apply(preprocess_tweet)
tweets_test_df["tokenized_tweet"] = tweets_test_df["tweet"].apply(preprocess_tweet)


# Vocabulary: every distinct token that occurs in the training split.
train_terms = set()
for tokens in tweets_train_df["tokenized_tweet"].tolist():
    train_terms.update(tokens)


# Map each term to a column index. The terms are sorted first because the
# iteration order of a set of strings can differ between interpreter sessions
# (string hashing is randomised). Without the sort, a model loaded from disk in
# a later session could see every term in a different column than the one it
# was fitted with. A model file written before this ordering was fixed should
# be deleted and refitted.
term_dict = {term: i for i, term in enumerate(sorted(train_terms))}


# Training tokens are all in the vocabulary by construction, so a plain lookup
# is enough.
tweets_train_df["bow_terms"] = tweets_train_df["tokenized_tweet"].apply(
    lambda x: [term_dict[term] for term in x]
)


# Test tweets may contain unseen tokens; those are skipped.
tweets_test_df["bow_terms"] = tweets_test_df["tokenized_tweet"].apply(
    lambda x: [term_dict[term] for term in x if term in term_dict]
)


# Sentiment label -> integer class id, numbered in order of first appearance
# in the shuffled frame.
classes = tweets_df["sentiment"].unique()
classes_dict = {c: i for i, c in enumerate(classes)}

In [68]:
def tweets_df_to_model_input(df, term_dict):
    """Build the sparse bag-of-words count matrix for a frame of tweets.

    Parameters
    ----------
    df : DataFrame with a ``bow_terms`` column; each entry is an iterable of
        column indices, one per token occurrence.
    term_dict : the vocabulary mapping. Only its length is used here, to fix
        the number of columns.

    Returns
    -------
    A ``dok_matrix`` of dtype ``int32`` with one row per row of ``df``, where
    entry ``(i, j)`` is the number of times index ``j`` occurs in row ``i``.
    """
    n_rows = len(df)
    n_cols = len(term_dict)
    counts = dok_matrix((n_rows, n_cols), dtype=np.int32)
    row = 0
    for term_indices in df["bow_terms"]:
        for col in term_indices:
            # Repeated indices accumulate, giving a term count per tweet.
            counts[row, col] = counts[row, col] + 1
        row += 1
    return counts


def tweets_df_to_model_output(df, classes_dict):
    """Encode the ``sentiment`` column of ``df`` as class ids.

    Each label is looked up in ``classes_dict``; a label that is not a key of
    the mapping raises ``KeyError``. Returns the ``.values`` of the encoded
    column.
    """

    def to_class_id(label):
        return classes_dict[label]

    encoded = df["sentiment"].apply(to_class_id)
    return encoded.values


def model_output_to_sentiment(output, dict):
    """Translate predicted class ids back into their sentiment labels.

    Parameters
    ----------
    output : iterable of class ids (for example the array returned by a
        model's ``predict``).
    dict : mapping of sentiment label -> class id. The parameter name shadows
        the builtin; it is kept so existing keyword callers keep working.

    Returns
    -------
    list holding the label for each element of ``output``, in order.

    Raises
    ------
    ValueError
        If an element of ``output`` is not a value of ``dict``.
    """
    # Invert the mapping once instead of rebuilding and scanning the key and
    # value lists for every prediction. setdefault keeps the first label when
    # two labels share an id, which matches a left-to-right search.
    label_by_id = {}
    for label, class_id in dict.items():
        label_by_id.setdefault(class_id, label)

    sentiments = []
    for out in output:
        if out not in label_by_id:
            raise ValueError(f"{out!r} is not a known class id")
        sentiments.append(label_by_id[out])
    return sentiments


def create_single_tweet_model_input(tweet, term_dict):
    """Build the one-row bag-of-words matrix for a single raw tweet.

    The tweet is tokenised with ``preprocess_tweet``; tokens that are not keys
    of ``term_dict`` are ignored. The matrix itself is built by
    ``tweets_df_to_model_input``, so a single tweet goes through the same
    counting code as a whole frame instead of a second copy of that loop.

    Parameters
    ----------
    tweet : the raw tweet text.
    term_dict : mapping of term -> column index.

    Returns
    -------
    The matrix returned by ``tweets_df_to_model_input`` for a one-row frame:
    shape ``(1, len(term_dict))``.
    """
    tokens = preprocess_tweet(tweet)
    bow_terms = [term_dict[term] for term in tokens if term in term_dict]
    # Wrap the index list in a one-row frame, the input shape the shared
    # helper expects.
    single_tweet_df = pd.DataFrame({"bow_terms": [bow_terms]})
    return tweets_df_to_model_input(single_tweet_df, term_dict)

In [69]:
# Feature matrix and encoded labels for the training split.
X = tweets_df_to_model_input(df=tweets_train_df, term_dict=term_dict)
y = tweets_df_to_model_output(df=tweets_train_df, classes_dict=classes_dict)

{'neutral': 0, 'positive': 1, 'negative': 2}


In [70]:
# Fit only when no model has been cached on disk yet; otherwise reuse the
# cached one.
# NOTE(review): joblib.load unpickles the file, so it should only ever be
# pointed at a model file from a trusted source.
if not os.path.isfile("svm_model.joblib"):
    model = SVC(C=1000)

    model.fit(X, y)
    joblib.dump(model, "svm_model.joblib")
else:
    model = joblib.load("svm_model.joblib")

In [71]:
# Predict the first 50 tweets of the test split and print each predicted
# label next to its tweet.
df = tweets_test_df.head(50)

X = tweets_df_to_model_input(df, term_dict)

y_pred = model.predict(X)

sample_tweets = df["tweet"].tolist()
sample_labels = model_output_to_sentiment(y_pred, classes_dict)

for s, t in zip(sample_labels, sample_tweets):
    print(s, "\t", t)

positive 	 I'm so frustrated with Game of Thrones and I'm only on the 10th episode
negative 	 @sassree Steve Bannon runs the racist antisemitic islamophoic website Breitbart which he professes to lead the alt-right that are Neo Nazis
neutral 	 """When your 2 1/2 yr old's 1st words this morning are """"It's Star Wars Day"""" b/c we are seeing a sneak peek tonight #ProudGeekMom #HesDarthVadar"""
positive 	 Good lord the food in Italy is so damn good.. Found a random cheap spot in Milan.. Venice tomorrow! http://t.co/PmyWIXQR5K
negative 	 As you may know I'm no lover of David Cameron, however his decision to authorise drone attacks to kill British terrorists was #SpotOn
neutral 	 """You sure it's not an Ice Cube, Dr Dre, and Eazy-E kinda day? https://t.co/aO0DtiDbSp"""
negative 	 Take that Liberals and Alt-Rightists! https://t.co/kwVVifhkYs
neutral 	 Ashley Graham Debuted ANOTHER New ‘Do at the 2016 VMAs #fitness #health #diet #gym #crossfit #muscle⇒ https://t.co/7x1qpuXFCH
neutral 	 All 

In [78]:
# Classify one hand-written tweet with the model stored on disk.
model = joblib.load("svm_model.joblib")

tweet = "where's the money skylar you bitch. im joking i love you"

# Featurise with the training vocabulary, predict, then map the id to a label.
X = create_single_tweet_model_input(tweet=tweet, term_dict=term_dict)
y_pred = model.predict(X)
s = model_output_to_sentiment(y_pred, classes_dict)

print(s)

['positive']
