In [1]:
import nltk
import glob
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn import naive_bayes

from sklearn.model_selection import cross_val_score
import csv

TEXT_CLEANING_RE = "https?:\S+|http?:\S|[^A-Za-z0-9]+"
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words and not token.isdigit() and len(token) > 2:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

## Use training & eval

In [2]:
# training data
df_train = pd.read_csv("train_combine.csv" , encoding="utf8" ,error_bad_lines=False)
data_train = pd.DataFrame(df_train, columns=['text', 'tag'])

nan_rows = data_train[data_train['text'].isnull()]
print(nan_rows)
nan_rows = data_train[data_train['tag'].isnull()]
print(nan_rows)

Empty DataFrame
Columns: [text, tag]
Index: []
Empty DataFrame
Columns: [text, tag]
Index: []


In [3]:
data_train.text = data_train.text.apply(lambda x: preprocess(x))
data_train.head(5)
data_train.tail(5)

Unnamed: 0,text,tag
22903,sunday national ice cream day take capeflyer g...,1
22904,mattbowenthw dunkin sensible option may always...,1
22905,dloesch kurtschlichter gov john kasich says su...,-1
22906,money right way claim social security lump sum...,1
22907,fracking found cause quakes otago daily times ...,0


In [4]:
corpus = data_train['text']
# can try all or partial attributes
vectorizer_train = TfidfVectorizer()
vector_dataset_train = vectorizer_train.fit_transform(corpus)

In [5]:
vector_df_train = pd.DataFrame(vector_dataset_train.todense(), columns=vectorizer_train.get_feature_names())

print(len(vectorizer_train.get_feature_names()))
#vector_df_train.head(5)

34641


In [6]:
X_train = vector_df_train.values  
y_train = data_train['tag'].values  

print(len(X_train))
print(len(y_train))

22908
22908


In [7]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50, class_weight='balanced',
                            random_state=7, verbose=3, n_jobs=-1)
clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50building tree 4 of 50
building tree 5 of 50

building tree 6 of 50building tree 7 of 50
building tree 8 of 50

building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.2min


building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 50
building tree 45 of 50
building tree 46 of 50
building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.7min finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=-1, oob_score=False, random_state=7,
            verbose=3, warm_start=False)

In [8]:
train_feature = vectorizer_train.get_feature_names()
print(len(train_feature))

34641


In [9]:
# eval data
df_eval = pd.read_csv("eval_combine.csv" , encoding="utf8" ,error_bad_lines=False)
data_eval = pd.DataFrame(df_eval, columns=['text', 'tag'])

nan_rows = data_eval[data_eval['text'].isnull()]
print(nan_rows)
nan_rows = data_eval[data_eval['tag'].isnull()]
print(nan_rows)

Empty DataFrame
Columns: [text, tag]
Index: []
Empty DataFrame
Columns: [text, tag]
Index: []


In [10]:
%%time
data_eval.text = data_eval.text.apply(lambda x: preprocess(x))
print(data_eval.head(5))

                                                text  tag
0  today tomorrow night work done midnight shame ...    0
1  time passing discussion christians always high...    0
2               wishhhhh going jason aldean tomorrow    1
3  kenklippenstein obviously right putin pen afd ...    0
4  thankful bitchy antics pointing animals shop s...    1
CPU times: user 218 ms, sys: 2.29 ms, total: 220 ms
Wall time: 219 ms


In [11]:
corpus_eval = data_eval['text']
vectorizer_eval = TfidfVectorizer(vocabulary= train_feature)
vector_dataset_eval = vectorizer_eval.fit_transform(corpus_eval)


In [12]:
vector_df_eval = pd.DataFrame(vector_dataset_eval.todense(), columns=vectorizer_eval.get_feature_names())

print(len(vectorizer_eval.get_feature_names()))
vector_df_eval.head(5)

34641


Unnamed: 0,00am,00ariana,00k,00pm,01am,01m,04th,05pm,0966kas,0bama,...,zquadom,zquadwantszayntosmile,zsuzsannakun,zubin,zucco,zuhour,zulfiqar,zum,zumba,zwei
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
X_eval = vector_df_eval.values  
y_eval = data_eval['tag'].values

In [14]:
print(len(X_eval))
print(len(y_eval))

4914
4914


In [15]:
y_pred = clf.predict(X_eval)  

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.2s finished


In [16]:
# output eval dats with predicted tags

y_pred_eval_tag = y_pred.tolist()
data_eval_result = data_eval
data_eval_result['pred_tag'] = y_pred_eval_tag
data_eval_result.to_csv(r"eval_tag_result.csv", sep=',', encoding='utf-8')

In [17]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score

print("Accuracy : {:.4f}\n".format(accuracy_score(y_eval, y_pred)))
print("Avg F1_score : {:.4f}\n".format(f1_score(y_eval, y_pred,
                                                average='weighted')))

f1_scores = f1_score(y_eval, y_pred, average=None)
print("F1_scores")
for c, f1 in zip(["negative","neutral","positive"], f1_scores):
    print("{:8} : {:.4f}".format(c, f1))

Accuracy : 0.6302

Avg F1_score : 0.6177

F1_scores
negative : 0.4712
neutral  : 0.6922
positive : 0.5996


In [18]:
# predict the un-tagged testing data
df_test = pd.read_csv("test_combine.csv" , encoding="utf8" ,error_bad_lines=False)
data_test = pd.DataFrame(df_eval, columns=['text', 'tag'])

nan_rows = data_eval[data_test['text'].isnull()]
print(nan_rows)
nan_rows = data_eval[data_test['tag'].isnull()]
print(nan_rows)

Empty DataFrame
Columns: [text, tag, pred_tag]
Index: []
Empty DataFrame
Columns: [text, tag, pred_tag]
Index: []


In [19]:
%%time
data_test.text = data_test.text.apply(lambda x: preprocess(x))
print(data_test.head(5))

                                                text  tag
0  today tomorrow night work done midnight shame ...    0
1  time passing discussion christians always high...    0
2               wishhhhh going jason aldean tomorrow    1
3  kenklippenstein obviously right putin pen afd ...    0
4  thankful bitchy antics pointing animals shop s...    1
CPU times: user 218 ms, sys: 1.99 ms, total: 220 ms
Wall time: 219 ms


In [20]:
corpus_test = data_test['text']
vectorizer_test = TfidfVectorizer(vocabulary= train_feature)
vector_dataset_test = vectorizer_test.fit_transform(corpus_test)

In [21]:
vector_df_test = pd.DataFrame(vector_dataset_test.todense(), columns=vectorizer_eval.get_feature_names())

print(len(vectorizer_test.get_feature_names()))
vector_df_test.head(5)

34641


Unnamed: 0,00am,00ariana,00k,00pm,01am,01m,04th,05pm,0966kas,0bama,...,zquadom,zquadwantszayntosmile,zsuzsannakun,zubin,zucco,zuhour,zulfiqar,zum,zumba,zwei
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
X_test = vector_df_test.values

In [23]:
y_pred_test = clf.predict(X_test)  

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.2s finished


In [24]:
data_test.head()

Unnamed: 0,text,tag
0,today tomorrow night work done midnight shame ...,0
1,time passing discussion christians always high...,0
2,wishhhhh going jason aldean tomorrow,1
3,kenklippenstein obviously right putin pen afd ...,0
4,thankful bitchy antics pointing animals shop s...,1


In [25]:
data_test.columns

Index(['text', 'tag'], dtype='object')

In [26]:
#output test data with predicted result

# added by klaas
data_test = pd.DataFrame(df_eval, columns=['id', 'text', 'tag'])

data_test_id_list = data_test['id'].tolist()
data_test_text_list = data_test['text'].tolist()
y_pred_test_tag = y_pred_test.tolist()

with open('test_tag_result1.csv', 'w', newline='' , encoding="utf-8") as csvfile:
    tweet_writer = csv.writer(csvfile, delimiter=',',
                            quotechar=',', quoting=csv.QUOTE_MINIMAL)
    for i in range(0, len(data_test_id_list)):
        output_list = []
        output_list.append(data_test_id_list[i])
        output_list.append(data_test_text_list[i])
        if y_pred_test_tag[i] == 1:
            output_list.append("positive")
        elif y_pred_test_tag[i] == 0:
            output_list.append("neutral")
        elif y_pred_test_tag[i] == -1:
            output_list.append("negative")  
        tweet_writer.writerow(output_list)

# Using combined training & eval to get more attributes

In [27]:
# show the format of dataset
df_train = pd.read_csv("all_combine.csv" , encoding="utf8" ,error_bad_lines=False)
data_train = pd.DataFrame(df_train, columns=['text', 'tag'])
# print(len(data))

nan_rows = data_train[data_train['text'].isnull()]
print(nan_rows)
nan_rows = data_train[data_train['tag'].isnull()]
print(nan_rows)

Empty DataFrame
Columns: [text, tag]
Index: []
Empty DataFrame
Columns: [text, tag]
Index: []


In [28]:
data_train.text = data_train.text.apply(lambda x: preprocess(x))
data_train.head(5)
data_train.tail(5)

Unnamed: 0,text,tag
27835,hrtablaze beginning dictatorship gun control e...,-1
27836,son idc anymore going shawn tomorrow,0
27837,remember clinton foundation,0
27838,press worried murray dominated 3rd round djoke...,0
27839,rinashah using moto 2nd gen month absolute del...,1


In [29]:
corpus = data_train['text']
vectorizer_train = TfidfVectorizer()
vector_dataset_train = vectorizer_train.fit_transform(corpus)

In [30]:
vector_df_train = pd.DataFrame(vector_dataset_train.todense(), columns=vectorizer_train.get_feature_names())

print(len(vectorizer_train.get_feature_names()))

39056


In [31]:
X = vector_df_train.values  
y = data_train['tag'].values  

In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)  

In [33]:
clf = RandomForestClassifier(n_estimators=50, class_weight='balanced',
                            random_state=7, verbose=3, n_jobs=-1)
clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 50building tree 2 of 50

building tree 3 of 50
building tree 4 of 50building tree 5 of 50

building tree 6 of 50building tree 7 of 50

building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.1min


building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 50
building tree 45 of 50
building tree 46 of 50
building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.3min finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=-1, oob_score=False, random_state=7,
            verbose=3, warm_start=False)

In [34]:
y_pred = clf.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.5s finished


In [38]:
print("Accuracy : {:.4f}\n".format(accuracy_score(y_test, y_pred)))

print("Avg F1_score : {:.4f}\n".format(f1_score(y_test, y_pred,
                                               average='weighted')))

f1_scores = f1_score(y_test, y_pred, average=None)
print("F1_scores")
for c, f1 in zip(["negative","neutral","positive"], f1_scores):
    print("{:8} : {:.4f}".format(c, f1))

Accuracy : 0.6249

Avg F1_score : 0.6131

F1_scores
negative : 0.4703
neutral  : 0.6911
positive : 0.5841


In [39]:
y_pred_test = clf.predict(X_test) 

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.5s finished


In [40]:
#output test data with predicted result
data_test_id_list = data_test['id'].tolist()
data_test_text_list = data_test['text'].tolist()
y_pred_test_tag = y_pred_test.tolist()

with open('test_tag_result2.csv', 'w', newline='' , encoding="utf-8") as csvfile:
    tweet_writer = csv.writer(csvfile, delimiter=',',
                            quotechar=',', quoting=csv.QUOTE_MINIMAL)
    for i in range(0, len(data_test_id_list)):
        output_list = []
        output_list.append(data_test_id_list[i])
        output_list.append(data_test_text_list[i])
        if y_pred_test_tag[i] == 1:
            output_list.append("positive")
        elif y_pred_test_tag[i] == 0:
            output_list.append("neutral")
        elif y_pred_test_tag[i] == -1:
            output_list.append("negative")  
        tweet_writer.writerow(output_list)
