# Baseline model

This notebook trains a simple logistic regression on the mean word embedding of each tweet, as a baseline for classifying tweets as positive or negative.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Suffix inserted into every input file name below ("" selects the default files).
SUFFIX = ""

# Word vectors and the vocabulary file that fixes the row index of each word.
EMBEDDINGS = f"tweet_embeddings/embeddings{SUFFIX}.txt"
VOCAB = f"tweet_embeddings/vocab{SUFFIX}.txt"

# Labelled training tweets and the tweets to predict.
POS_TWEETS = f"twitter-datasets/train_pos{SUFFIX}.txt"
NEG_TWEETS = f"twitter-datasets/train_neg{SUFFIX}.txt"
TEST_DATA = f"twitter-datasets/test_data{SUFFIX}.txt"

# Parse the embeddings: every line is "<word> <v1> <v2> ...", separated by single spaces.
# The encoding is pinned so that parsing does not depend on the platform default.
vecs = {}
with open(EMBEDDINGS, "r", encoding="utf-8") as f:
    for line in f:
        word, *values = line.rstrip().split(' ')
        vecs[word] = np.array([float(x) for x in values])

if not vecs:
    raise ValueError(f"no embeddings found in {EMBEDDINGS}")

# Parse the vocabulary: the first token of line i is the word with index i.
with open(VOCAB, "r", encoding="utf-8") as f:
    vocab = {line.rstrip().split(' ')[0]: i for i, line in enumerate(f)}

# Embedding matrix: row i holds the vector of the word with vocabulary index i.
# Rows of vocabulary words that have no parsed vector stay all-zero.
embedding_dim = len(next(iter(vecs.values())))
embeddings = np.zeros((len(vocab), embedding_dim))
for w, v in vecs.items():
    if w == "<unk>":
        # NOTE(review): presumably "<unk>" has no entry in the vocabulary file — confirm
        continue
    embeddings[vocab[w], :] = v


In [3]:
def read_tokenized_tweets(path):
    """Read a tweet file and return one list of whitespace-separated tokens per line.

    The encoding is pinned to UTF-8 so the tokens do not depend on the
    platform's default text encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        # str.split() without arguments already ignores leading/trailing whitespace
        return [line.split() for line in f]


n_tweets = read_tokenized_tweets(NEG_TWEETS)
p_tweets = read_tokenized_tweets(POS_TWEETS)


In [4]:
# Each line of the test file is "<id>,<tweet text>". Only the first comma is a
# separator: the tweet text itself may contain further commas.
testing_tweets = []
testing_tweets_ids = []
with open(TEST_DATA, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # a blank line carries neither an id nor a tweet
            continue
        tweet_id, _, text = line.rstrip().partition(',')
        testing_tweets.append(text.split())
        testing_tweets_ids.append(int(tweet_id))


In [5]:
# Convert each tweet to a fixed-size feature vector: the mean of the embeddings
# of its in-vocabulary words (the vector length is the embedding dimension).
series_train = []
series_test = []

def load_tweets(tweets_list, series, label=None):
    """Append one feature row per tweet to `series` and return it.

    Each tweet (a list of tokens) becomes a `pd.Series` with entries
    ``f1 .. fN`` holding the mean of the `embeddings` rows of its words that
    are present in `vocab`; N is the number of columns of `embeddings`.
    Words missing from `vocab` are ignored, and a tweet without any known word
    gets an all-zero vector.

    Parameters
    ----------
    tweets_list : sequence of lists of str
        Tokenized tweets.
    series : list
        List the rows are appended to (modified in place).
    label : optional
        When not None, stored in every row under the key ``'label'``.

    Returns
    -------
    list
        The same `series` list that was passed in.
    """
    print("Loading tweets...")
    tot = len(tweets_list)
    embedding_dim = embeddings.shape[1]
    for i, tweet in enumerate(tweets_list):
        if i % 1000 == 0:
            # Integer arithmetic: int(i/tot*100) truncated float error,
            # e.g. 29000/100000 was reported as 28 %.
            print(f"{i}/{tot} ({i * 100 // tot} %)")
        indices = [vocab[word] for word in tweet if word in vocab]
        if indices:
            tweet_embedding = embeddings[indices].mean(axis=0)
        else:
            # no known word: fall back to a zero vector of the right size
            tweet_embedding = np.zeros((embedding_dim,))
        serie_dict = {f'f{x+1}': data for x, data in enumerate(tweet_embedding)}
        if label is not None:
            serie_dict['label'] = label
        series.append(pd.Series(serie_dict))
    return series

# Training rows: positive tweets (label 1) first, then negative tweets (label -1),
# all collected in the same list; the order is shuffled later.
for labelled_tweets, tweet_label in ((p_tweets, 1), (n_tweets, -1)):
    series_train = load_tweets(labelled_tweets, series_train, tweet_label)

# Prediction rows carry no label.
series_test = load_tweets(tweets_list=testing_tweets, series=series_test)


Loading tweets...
0/100000 (0 %)
1000/100000 (1 %)
2000/100000 (2 %)
3000/100000 (3 %)
4000/100000 (4 %)
5000/100000 (5 %)
6000/100000 (6 %)
7000/100000 (7 %)
8000/100000 (8 %)
9000/100000 (9 %)
10000/100000 (10 %)
11000/100000 (11 %)
12000/100000 (12 %)
13000/100000 (13 %)
14000/100000 (14 %)
15000/100000 (15 %)
16000/100000 (16 %)
17000/100000 (17 %)
18000/100000 (18 %)
19000/100000 (19 %)
20000/100000 (20 %)
21000/100000 (21 %)
22000/100000 (22 %)
23000/100000 (23 %)
24000/100000 (24 %)
25000/100000 (25 %)
26000/100000 (26 %)
27000/100000 (27 %)
28000/100000 (28 %)
29000/100000 (28 %)
30000/100000 (30 %)
31000/100000 (31 %)
32000/100000 (32 %)
33000/100000 (33 %)
34000/100000 (34 %)
35000/100000 (35 %)
36000/100000 (36 %)
37000/100000 (37 %)
38000/100000 (38 %)
39000/100000 (39 %)
40000/100000 (40 %)
41000/100000 (41 %)
42000/100000 (42 %)
43000/100000 (43 %)
44000/100000 (44 %)
45000/100000 (45 %)
46000/100000 (46 %)
47000/100000 (47 %)
48000/100000 (48 %)
49000/100000 (49 %)
50000

In [6]:
# Turn the per-tweet rows into DataFrames.
print("Creating DataFrame...")
df_train = pd.DataFrame(series_train)

# The test frame is indexed by the tweet ids read from the input file.
df_test = (
    pd.DataFrame(series_test)
    .assign(index=testing_tweets_ids)
    .set_index("index")
)

# Preview the first rows of the last five columns of each frame
# (df_test has no "label" column).
for title, frame in (
    ("Training DataFrame sample", df_train),
    ("Testing DataFrame sample", df_test),
):
    print(title)
    print(frame.iloc[:, -5:].head(5))


Creating DataFrame...
Training DataFrame sample
        f17       f18       f19       f20  label
0  0.006545  0.592248  0.016852 -1.591182    1.0
1 -0.261796  0.292175 -0.168170 -1.485552    1.0
2 -0.408993  0.321010  0.124718 -1.531494    1.0
3  0.148516  0.494902 -0.254856 -1.533331    1.0
4 -0.404575  0.489780 -0.030524 -1.233568    1.0
Testing DataFrame sample
            f16       f17       f18       f19       f20
index                                                  
1     -0.299459 -0.379505  0.444008  0.241766 -1.407697
2     -0.475540  0.139808  0.363823 -0.054524 -2.219004
3     -0.233857  0.135580 -0.039838 -0.145392 -1.633895
4     -0.152084  0.272331  0.429447 -0.185306 -1.995526
5     -0.001477  0.108543 -0.007657 -0.152091 -1.633170


In [10]:
RANDOM_SEED = 1234
# Shuffle the rows: drawing every row without replacement is a permutation.
df_train = df_train.sample(n=len(df_train), random_state=RANDOM_SEED)
# Features are all columns but the last one; the last column is the target.
X = df_train.iloc[:, :-1]
y = df_train.iloc[:, -1]
# Hold out 30 % of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_SEED,
)
# Optional: uncomment to persist the splits.
# X_train.to_csv("X_train.csv", index=False)
# X_test.to_csv("X_test.csv", index=False)
# y_train.to_csv("y_train.csv", index=False)
# y_test.to_csv("y_test.csv", index=False)

In [11]:
# Fit an L2-penalised logistic regression on the training split.
logistic = LogisticRegression(
    max_iter=100,
    penalty='l2',
    random_state=RANDOM_SEED + 2,
).fit(X_train, y_train)

# Report the F1 score on the held-out split.
y_pred = logistic.predict(X_test)
held_out_f1 = f1_score(y_test, y_pred)
print(held_out_f1)


0.7175030680951274


In [12]:
# Predict a label for every tweet of the testing set and pair it with the
# tweet id; both columns are stored as integers.
predictions = logistic.predict(df_test)
prediction_columns = {"Id": df_test.index, "Prediction": predictions}
df_predictions = pd.DataFrame(prediction_columns, dtype=int)


In [13]:
prediction_file = "predictions/very_baseline.csv"
# Make sure the output directory exists: to_csv does not create missing parent
# directories, and failing here would lose the result of the whole run.
Path(prediction_file).parent.mkdir(parents=True, exist_ok=True)
print(f"Saving to {prediction_file}")
df_predictions.to_csv(prediction_file, index=False)

Saving to predictions/very_baseline.csv
