In [1]:
import pandas as pd
import re
import numpy as np
from scipy.sparse import dok_matrix
from sklearn.svm import SVC

# Globally disable pandas' chained-assignment warning (the default setting is
# 'warn'); later cells assign new columns to frames derived from `tweets_df`.
pd.options.mode.chained_assignment = None  # default='warn'


def preprocess_tweet(text):
    """Tokenize a tweet into lowercase alphanumeric words.

    The text is lowercased, every ``http`` followed by non-whitespace
    characters (i.e. a URL) is removed, and the remaining maximal runs of
    ``[a-z0-9]`` characters are returned as tokens.

    Args:
        text: The raw tweet as a string.

    Returns:
        A list of non-empty lowercase tokens in order of appearance; an empty
        list when the text contains no alphanumeric characters.
    """
    # Lowercase *before* stripping URLs so links written with an upper- or
    # mixed-case scheme ("HTTP://...", "Https://...") are removed as well
    # instead of leaking "http", host and path fragments into the tokens.
    lowered = text.lower()
    without_urls = re.sub(r"http\S+", "", lowered)
    # Collecting runs of allowed characters is the same as splitting on the
    # complement and discarding empty strings.
    return re.findall(r"[a-z0-9]+", without_urls)

In [2]:
# Load the tab-separated tweet file and shuffle all rows with a fixed seed so
# the split below is reproducible.
tweets_df = pd.read_csv("train.txt", delimiter="\t")
tweets_df = tweets_df.sample(frac=1, random_state=0)

# 80/20 train/test split over the shuffled rows.
train_size = int(len(tweets_df) * 0.8)
test_size = len(tweets_df) - train_size

# `head`/`tail` return slices of `tweets_df`; copy them so that columns added
# to the train/test frames later are written to independent frames rather
# than to a slice of the shuffled source frame.
tweets_train_df = tweets_df.head(train_size).copy()
tweets_test_df = tweets_df.tail(test_size).copy()

In [21]:
# Tokenize every training tweet.
tweets_train_df["tokenized_tweet"] = tweets_train_df["tweet"].apply(preprocess_tweet)

# Vocabulary: the set of distinct tokens that occur in the training tweets.
train_terms = set()
for tokens in tweets_train_df["tokenized_tweet"].tolist():
    train_terms.update(tokens)

# Map each term to a column index. The terms are sorted first because the
# iteration order of a set of strings can change between interpreter runs
# (string hash randomization), which would make the term -> column mapping
# differ after every kernel restart.
term_dict = {term: i for i, term in enumerate(sorted(train_terms))}

# Replace each token by its column index (all tokens are in the vocabulary
# here because the vocabulary was built from these same tweets).
tweets_train_df["bow_terms"] = tweets_train_df["tokenized_tweet"].apply(
    lambda x: [term_dict[term] for term in x]
)

In [14]:
# Sparse bag-of-words count matrix: one row per training tweet, one column
# per vocabulary term.
n_rows, n_cols = len(tweets_train_df), len(train_terms)
X = dok_matrix((n_rows, n_cols), dtype=np.int32)
for row, term_ids in enumerate(tweets_train_df["bow_terms"]):
    for col in term_ids:
        # A term that occurs several times in a tweet is counted each time.
        X[row, col] += 1

In [15]:
# Distinct sentiment labels, in order of first appearance in the training frame.
categories = tweets_train_df["sentiment"].unique()

# Label -> integer class id.
categories_dict = {label: idx for idx, label in enumerate(categories)}

# y is aligned row-for-row with tweets_train_df.
y = tweets_train_df["sentiment"].apply(lambda label: categories_dict[label]).values

In [16]:
# Fit a support-vector classifier (C=1000) on the bag-of-words matrix `X`
# and the integer-encoded sentiment labels `y`.
model = SVC(C=1000)
model.fit(X=X, y=y)

In [26]:
def tweet_to_model_input(tweet, term_dict, train_terms):
    """Turn one raw tweet into a single-row bag-of-words count matrix.

    Args:
        tweet: The raw tweet text; it is tokenized with ``preprocess_tweet``.
        term_dict: Mapping from vocabulary term to column index.
        train_terms: The vocabulary collection; only its length is used, as
            the number of columns of the result.

    Returns:
        A ``dok_matrix`` of shape ``(1, len(train_terms))`` with dtype int32
        holding the per-term counts. Tokens that are not keys of
        ``term_dict`` are ignored.
    """
    tokens = preprocess_tweet(tweet)
    row = dok_matrix((1, len(train_terms)), dtype=np.int32)
    for token in tokens:
        if token in term_dict:
            row[0, term_dict[token]] += 1
    return row

  (0, 22461)	1
  (0, 3943)	1
  (0, 27288)	1
  (0, 1887)	1


In [33]:
tweet = "fuck you"

# Use a dedicated name for the single-tweet matrix: rebinding `X` here would
# replace the training matrix, so re-running the fit cell afterwards would
# train on this one row instead of the training data.
X_tweet = tweet_to_model_input(tweet, term_dict, train_terms)

y_pred = model.predict(X_tweet)

print(y_pred)

# Invert the label encoding (class id -> label) and look up the single
# prediction by its scalar value, rather than searching a list with the whole
# prediction array, which only works while exactly one row is predicted.
label_by_id = {idx: label for label, idx in categories_dict.items()}
print(label_by_id[int(y_pred[0])])

[2]
negative
