In [50]:
import pandas as pd
import re
import numpy as np
from scipy.sparse import dok_matrix
from sklearn.svm import SVC

# Turns off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole session, so column assignments on sliced frames emit no warning.
pd.options.mode.chained_assignment = None  # default='warn'


def preprocess_tweet(text):
    """Turn a raw tweet into a list of lowercase alphanumeric tokens.

    Anything that starts with ``http`` and runs up to the next whitespace is
    removed first, the remainder is lowercased, and the result is split on
    every run of characters outside ``a-z``, ``A-Z`` and ``0-9``.

    Args:
        text: The tweet as a string.

    Returns:
        A list of non-empty token strings, in the order they occur.
    """
    url_free = re.sub(r"http\S+", "", text)
    pieces = re.split(r"[^a-zA-Z0-9]+", url_free.lower())
    # re.split yields empty strings at the edges when the text starts or ends
    # with a separator; those are discarded.
    return [piece for piece in pieces if piece]

In [51]:
# Read the tab-separated file of tweets.
tweets_df = pd.read_csv("train.txt", delimiter="\t")
# Shuffle every row with a fixed seed so the split below is reproducible.
tweets_df = tweets_df.sample(frac=1, random_state=0)
# 80 % of the shuffled rows go to training, the remainder to testing.
train_size = int(len(tweets_df) * 0.8)
test_size = len(tweets_df) - train_size
# head()/tail() return slices of tweets_df; take explicit copies so that the
# columns added to these frames later are written to independent objects
# rather than to a slice of the shuffled frame.
tweets_train_df = tweets_df.head(train_size).copy()
tweets_test_df = tweets_df.tail(test_size).copy()

In [52]:
# Tokenize the raw tweet text of both splits.
tweets_train_df["tokenized_tweet"] = tweets_train_df["tweet"].apply(preprocess_tweet)
tweets_test_df["tokenized_tweet"] = tweets_test_df["tweet"].apply(preprocess_tweet)

# Vocabulary: every distinct token that occurs in the training split only.
train_terms = set()
for tokens in tweets_train_df["tokenized_tweet"]:
    train_terms.update(tokens)

# Map each term to a column index. Iterating a set of strings directly gives
# an order that can change from one interpreter session to the next (string
# hashing is randomized per process), so the terms are sorted first to make
# the term -> column assignment stable across kernel restarts.
term_dict = {term: i for i, term in enumerate(sorted(train_terms))}

# Training tokens are all in the vocabulary by construction.
tweets_train_df["bow_terms"] = tweets_train_df["tokenized_tweet"].apply(
    lambda x: [term_dict[term] for term in x]
)

# Test tokens that never occurred in training have no column and are dropped.
tweets_test_df["bow_terms"] = tweets_test_df["tokenized_tweet"].apply(
    lambda x: [term_dict[term] for term in x if term in term_dict]
)

# Sentiment label -> integer code, in order of first appearance in the full
# shuffled frame.
categories = tweets_df["sentiment"].unique()
categories_dict = {c: i for i, c in enumerate(categories)}

In [53]:
def tweets_df_to_model_input(df, train_terms_len):
    """Build a sparse term-count matrix from the ``bow_terms`` column.

    Args:
        df: Frame whose ``bow_terms`` column holds, per row, an iterable of
            integer column indices (one entry per token occurrence).
        train_terms_len: Number of columns of the resulting matrix.

    Returns:
        A ``dok_matrix`` of shape ``(len(df), train_terms_len)`` and dtype
        ``int32`` where entry ``[r, c]`` is how many times index ``c`` occurs
        in row ``r``'s ``bow_terms``.
    """
    features = dok_matrix((len(df), train_terms_len), dtype=np.int32)
    for row_idx, term_ids in enumerate(df["bow_terms"]):
        # Count occurrences per column for this row, then store each count once.
        tally = {}
        for col_idx in term_ids:
            tally[col_idx] = tally.get(col_idx, 0) + 1
        for col_idx, n_hits in tally.items():
            features[row_idx, col_idx] = n_hits
    return features


def tweets_df_to_model_output(df, categories_dict=None):
    """Encode the ``sentiment`` column of ``df`` as integer class codes.

    Args:
        df: Frame with a ``sentiment`` column.
        categories_dict: Optional mapping from sentiment label to integer
            code. When omitted, a mapping is derived from ``df`` itself by
            numbering the labels in order of first appearance, so the codes
            then depend on the row order of ``df``. Pass the same mapping that
            is later used to decode predictions to keep the encoding
            consistent across different frames.

    Returns:
        The array of codes, one per row of ``df``.

    Raises:
        KeyError: If a label in ``df`` is missing from ``categories_dict``.
    """
    if categories_dict is None:
        categories_dict = {
            label: code for code, label in enumerate(df["sentiment"].unique())
        }
    return df["sentiment"].apply(lambda x: categories_dict[x]).values


def model_output_to_sentiment(output, dict):
    """Translate integer class codes back into their sentiment labels.

    Args:
        output: Iterable of class codes (e.g. the array returned by
            ``model.predict``).
        dict: Mapping from sentiment label to class code. (The parameter name
            shadows the builtin; it is kept so existing callers still work.)

    Returns:
        A list with, for every code in ``output``, the first label in ``dict``
        whose value equals that code.

    Raises:
        ValueError: If a code in ``output`` is not a value of ``dict``.
    """
    # Build the key/value lists once instead of once per prediction.
    labels = list(dict.keys())
    codes = list(dict.values())
    return [labels[codes.index(out)] for out in output]

In [54]:
# Integer class codes and sparse term-count features for the training split.
y = tweets_df_to_model_output(tweets_train_df)
X = tweets_df_to_model_input(tweets_train_df, train_terms_len=len(train_terms))

In [55]:
# Support vector classifier; C is SVC's regularization parameter and is
# hard-coded to 1000 here (all other settings are left at their defaults).
model = SVC(C=1000)
# Train on the X / y built in the previous cell. Kept as the cell's last
# expression so the notebook displays the fitted estimator.
model.fit(X, y)

In [57]:
# Spot-check: predict the first 50 rows of the test split.
df = tweets_test_df.iloc[:50]

# Note: this rebinds X (previously the training matrix) to the sample's features.
X = tweets_df_to_model_input(df, train_terms_len=len(train_terms))

y_pred = model.predict(X)

print(y_pred)

[0 0 0 2 1 0 2 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0
 0 1 0 1 0 2 0 1 0 0 1 0 1]


In [59]:
# Show each sampled tweet next to the sentiment label predicted for it.
sample_tweets = df["tweet"].tolist()
sample_labels = model_output_to_sentiment(y_pred, categories_dict)
for tweet_text, label in zip(sample_tweets, sample_labels):
    print(label, "\t", tweet_text)

neutral 	 To anyone in Briana's family who may stumble upon this let it be known that we've always believed you're bottom of the barrel trash but that
neutral 	 EU should demand Saakashvili get top gov position before anymore cash. https://t.co/5LcgJtDR3e via @FT
neutral 	 @RondaRousey will make Floyd Mayweather her bitch. i'll give her 35 seconds 1st round ;)
negative 	 @BuzzingWASP_ @roflmaoism To be honest, I don't even know where you Alt-Rightists get your swagger from, you got treated like bitches.
positive 	 """@baghdadinvest @abbasnasir59 Not just Christians, you are in the prayers of """"all of us"""" humans.May you stay safe and be free soon!God bless."""
neutral 	 Chuck Norris:jan strangle you with a cordless phone.
negative 	 Noah fence but I'm sick of people telling me what I should or shouldn't be eating e.g vegetarianism
neutral 	 Josh Hamilton (knee) is out of the Rangers' starting lineup again on Tuesday night against the Mariners.
positive 	 Can't wait to see the iPad 