In [99]:
import pandas as pd
df = pd.read_csv("data.csv")
df.columns

Index(['Date', 'Label', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7',
       'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15',
       'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23',
       'Top24', 'Top25'],
      dtype='object')

In [100]:
#Data Preprocessing

#removing the special characters
df.replace("[^a-zA-Z0-9]", " ", regex=True, inplace = True)

#lowercase all the strings
for col in df.columns:
    if col != 'Label':
        df[col] = df[col].str.lower()

        
# splitting the test train based on date 01/01/2015 (problem statement) 
# although this is not recommended as we need to shuffle data
# to add randomness and similar distribution of data on both
# test and train datasets
train_data = df[df['Date'] <= '20150101']
test_data = df[df['Date'] > '20141231']
print(df.shape, train_data.shape, test_data.shape)

#extracting the input variables, features, independent variables
x_train = train_data.iloc[ : , 2:27]
x_test = test_data.iloc[ : , 2:27]
print(x_train.shape, x_test.shape)

#extracting the output variable, target, dependent variable
y_train = train_data.iloc[ : , 1]
y_test = test_data.iloc[ : , 1]
print(y_train.shape, y_test.shape)

(1989, 27) (1863, 27) (378, 27)
(1863, 25) (378, 25)
(1863,) (378,)


In [101]:
#combine the headlines of each day into a paragraph 
headlines = []
for row in range(len(x_train.index)):
    headlines.append(''.join(str(x) for x in x_train.iloc[row, 0 : 25]))
x_train['headlines'] = headlines

headlines = []
for row in range(len(x_test.index)):
    headlines.append(''.join(str(x) for x in x_test.iloc[row, 0 : 25]))
x_test['headlines'] = headlines

In [104]:
#CountVectorizer for embeddings and RandomForestClassifier for classification
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

#implement BAG of WORDS
count_vector = CountVectorizer(ngram_range = (1,2))
train_dataset = count_vector.fit_transform(x_train['headlines'])


#implement Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators = 200, criterion = 'entropy')
rf_classifier.fit(train_dataset, y_train)

#predict for the test data
test_dataset = count_vector.transform(x_test['headlines'])
predictions = rf_classifier.predict(test_dataset)

In [109]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)
score = accuracy_score(y_test, predictions)
print(score)
report = classification_report(y_test, predictions)
print(report)

[[152  34]
 [ 23 169]]
0.8492063492063492
              precision    recall  f1-score   support

           0       0.87      0.82      0.84       186
           1       0.83      0.88      0.86       192

    accuracy                           0.85       378
   macro avg       0.85      0.85      0.85       378
weighted avg       0.85      0.85      0.85       378



In [111]:
#TF-IDF Vectorizer for embeddings and RandomForestClassifier for classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

tf_idf_vector = TfidfVectorizer(ngram_range=(2,2))
train_dataset = tf_idf_vector.fit_transform(x_train['headlines'])


#implement Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators = 200, criterion = 'entropy')
rf_classifier.fit(train_dataset, y_train)

#predict for the test data
test_dataset = tf_idf_vector.transform(x_test['headlines'])
predictions = rf_classifier.predict(test_dataset)

In [112]:
conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)
score = accuracy_score(y_test, predictions)
print(score)
report = classification_report(y_test, predictions)
print(report)

[[152  34]
 [ 26 166]]
0.8412698412698413
              precision    recall  f1-score   support

           0       0.85      0.82      0.84       186
           1       0.83      0.86      0.85       192

    accuracy                           0.84       378
   macro avg       0.84      0.84      0.84       378
weighted avg       0.84      0.84      0.84       378



In [116]:
from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
nb_classifier.fit(train_dataset, y_train)

predictions = nb_classifier.predict(test_dataset)

In [117]:
conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)
score = accuracy_score(y_test, predictions)
print(score)
report = classification_report(y_test, predictions)
print(report)

[[130  56]
 [  0 192]]
0.8518518518518519
              precision    recall  f1-score   support

           0       1.00      0.70      0.82       186
           1       0.77      1.00      0.87       192

    accuracy                           0.85       378
   macro avg       0.89      0.85      0.85       378
weighted avg       0.89      0.85      0.85       378

