### Only Text Model

We noticed a mismatch between `data.csv` (the training data) and `validation_data.csv` (the prediction-challenge data): the two files do not contain the same subject categories, as shown below:

In [None]:
# Load the training set and list the distinct values of its `subject` column.
df = pd.read_csv('./dataset/data.csv', encoding="ISO-8859-1")
df['subject'].unique()

array(['politicsNews', 'worldnews', 'News', 'politics', 'Government News',
       'left-news'], dtype=object)

In [None]:
# Load the validation (prediction-challenge) set and list its `subject` values
# so they can be compared with the training categories.
df_val = pd.read_csv('./dataset/validation_data.csv', encoding="ISO-8859-1")
df_val['subject'].unique()

array(['worldnews', 'left-news', 'US_News', 'Middle-east'], dtype=object)

This mismatch between the subject categories of the training and validation datasets could lead our model to make poor predictions: a model that relies on `subject` would meet categories at prediction time that it never saw during training. This is why we decided to develop another model that uses only the title and the text.

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib
from scipy.sparse import hstack

# Fetch the NLTK resources requested by this notebook: the stop-word lists
# and the punkt tokenizer data.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# English stemmer and stop-word set used by the text-cleaning function.
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Target vector: the `label` column, captured before that column is dropped below.
y = df['label']

# One free-text feature per row: title and body joined by a single space.
df['combined'] = df['title'] + " " + df['text']

# Remove the target and the raw text / date / subject columns from the model input.
df = df.drop(['label', 'title', 'text', 'date', 'subject'], axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42
)

In [17]:
def preprocess_text(text):
    """Normalise one raw document into a space-separated string of stems.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is deleted, and the remainder is tokenised with NLTK's
    ``word_tokenize``. Tokens present in ``stop_words`` are discarded and the
    remaining tokens are stemmed with ``stemmer``.

    Args:
        text: The raw document. ``.lower()`` is called on it, so it is
            expected to be a ``str``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_stems = [
        stemmer.stem(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    ]
    return ' '.join(kept_stems)

In [19]:
# Add the cleaned text as a `clean_combined` column on each split; both splits
# go through the same preprocess_text so they are normalised identically.
X_train = X_train.assign(clean_combined=X_train['combined'].apply(preprocess_text))
X_test = X_test.assign(clean_combined=X_test['combined'].apply(preprocess_text))

In [21]:
# TF-IDF over unigrams and bigrams of the cleaned text, capped at 5000 features.
# The vectoriser is fitted on the training split only.
tfidf_only_text = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_text = tfidf_only_text.fit_transform(X_train['clean_combined'])

In [None]:
# Persist the fitted vectoriser so the prediction step can reload it.
# NOTE(review): presumably requires the ./models directory to exist already — confirm.
joblib.dump(tfidf_only_text, './models/tfidf_vectorizer_onlytext.pkl')

['tfidf_vectorizer_onlytext.pkl']

In [23]:
# Vectorise the held-out split with the already-fitted vectoriser (transform only, no refit).
X_test_text = tfidf_only_text.transform(X_test['clean_combined'])

In [24]:
# Obtain vocab
feature_names = tfidf_only_text.get_feature_names_out()
# convert to df
df_tfidf = pd.DataFrame(X_train_text.toarray(), columns=feature_names)
df_tfidf.head()

Unnamed: 0,abandon,abba,abc,abc news,abdullah,abe,abedin,abid,abil,abl,...,younger,your,youth,youtub,zealand,zero,zika,zimbabw,zone,zuma
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0329,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Random forest on the TF-IDF features: 100 trees, depth capped at 25,
# n_jobs=-1 to use all available cores, fixed seed for reproducibility.
model_onlytext = RandomForestClassifier(n_estimators=100, max_depth=25, n_jobs=-1, random_state=42)
model_onlytext.fit(X_train_text, y_train)

In [None]:
# Persist the trained classifier so the prediction step can reload it.
# NOTE(review): presumably requires the ./models directory to exist already — confirm.
joblib.dump(model_onlytext, './models/random_forest_model_onlytext.pkl')

['random_forest_model_onlytext.pkl']

In [27]:
# Predict labels for the held-out split.
y_pred = model_onlytext.predict(X_test_text)

In [28]:
# Report accuracy, the confusion matrix and the per-class classification
# report for the held-out split, each under its own heading.
for heading, metric_fn in (
    (" Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric_fn(y_test, y_pred))

 Accuracy: 0.9964951808737014
Confusion Matrix:
 [[3975   21]
 [   7 3986]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      3996
           1       0.99      1.00      1.00      3993

    accuracy                           1.00      7989
   macro avg       1.00      1.00      1.00      7989
weighted avg       1.00      1.00      1.00      7989



In [None]:
# Reload the text-only artefacts written by the joblib.dump cells of this
# section. The file names must carry the "_onlytext" suffix: the un-suffixed
# random_forest_model.pkl / tfidf_vectorizer.pkl are not produced anywhere in
# this section, and loading them would make the validation predictions come
# from a different model and vocabulary.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
model_onlytext = joblib.load('./models/random_forest_model_onlytext.pkl')
tfidf_only_text = joblib.load('./models/tfidf_vectorizer_onlytext.pkl')

In [52]:
# Build the text feature exactly as it was built for training: title and body
# separated by a single space. Without the separator the last word of the title
# and the first word of the body fuse into one token, which the vocabulary
# fitted on the training data is unlikely to contain.
df_val['combined'] = df_val['title'] + " " + df_val['text']
# Apply the same cleaning function that was used on the training text.
df_val['clean_combined'] = df_val['combined'].apply(preprocess_text)

In [53]:
# New frame without the label, raw text, date and subject columns; df_val itself is not modified.
df_val_clean = df_val.drop(columns=['label', 'title', 'text', 'date','subject'])

In [56]:
# Vectorise the cleaned validation text with the fitted vectoriser (transform only, no refit).
X_val_text = tfidf_only_text.transform(df_val_clean['clean_combined'])


In [57]:
# Predict a label for every validation row.
preds = model_onlytext.predict(X_val_text)

In [58]:
# Store the predictions in the `label` column of the validation frame,
# overwriting the values that column held when the file was loaded.
df_val['label'] = preds

In [59]:
# Select and order the columns to export; the helper columns
# (`combined`, `clean_combined`) are left out.
df_val_final = df_val[['label', 'title', 'text', 'subject', 'date']]

In [60]:
# Preview the first rows of the frame that will be exported.
df_val_final.head()

Unnamed: 0,label,title,text,subject,date
0,1,UK's May 'receiving regular updates' on London...,LONDON (Reuters) - British Prime Minister Ther...,worldnews,"September 15, 2017"
1,1,UK transport police leading investigation of L...,LONDON (Reuters) - British counter-terrorism p...,worldnews,"September 15, 2017"
2,1,Pacific nations crack down on North Korean shi...,WELLINGTON (Reuters) - South Pacific island na...,worldnews,"September 15, 2017"
3,1,Three suspected al Qaeda militants killed in Y...,"ADEN, Yemen (Reuters) - Three suspected al Qae...",worldnews,"September 15, 2017"
4,1,Chinese academics prod Beijing to consider Nor...,BEIJING (Reuters) - Chinese academics are publ...,worldnews,"September 15, 2017"


In [41]:
# Write the predictions to a CSV in the current working directory, without the index column.
df_val_final.to_csv('predictions_validation_onlytext.csv', index=False)