In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from utils import prepropcess_data, get_embeddings_tfidf, get_embeddings_gzip
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Read the labelled training set from the working directory; the bare name on
# the last line makes the notebook render the frame.
df = pd.read_csv(filepath_or_buffer='train.csv')
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [3]:
# Run the raw training frame through the project's preprocessing helper
# (`prepropcess_data` is already imported in the first cell, so it is not
# re-imported here).
# NOTE(review): the column selection below implies the helper adds a
# `final_text` column and keeps `target` — confirm against utils.prepropcess_data.
# The result is named for what it holds (training data); the test set gets its
# own `df_test` further down.
df_train_processed = prepropcess_data(df)

# Keep only the model input text and the label.
df_final = df_train_processed[['final_text', 'target']]
df_final

Unnamed: 0,final_text,target
0,deed reason earthquake may allah forgive u,1
1,forest fire near la ronge sask canada,1
2,resident ask place notify officer evacuation s...,1
3,people receive wildfire evacuation order calif...,1
4,get sent photo ruby alaska smoke wildfire pour...,1
...,...,...
7608,two giant crane hold bridge collapse nearby ho...,1
7609,thetawniest control wild fire california even ...,1
7610,utc volcano hawaii http,1
7611,police investigate collide car little portugal...,1


In [4]:
# Pull the text column (features) and the label column out of df_final as
# plain arrays, in that order.
X_train, y_train = (df_final[column].values for column in ('final_text', 'target'))

# TRAIN MODELS

## SVC

In [5]:
# Hyper-parameters used for this pipeline (presumably the result of an earlier
# parameter search — TODO confirm):
# {'svm__C': 1, 'svm__gamma': 'scale', 'svm__kernel': 'linear', 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 1)}
svm = SVC(kernel='linear', C=1, gamma='scale')
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_df=0.75)

# TF-IDF features feed straight into the support-vector classifier.
pipeline_svm = Pipeline(steps=[('tfidf', tfidf), ('svm', svm)])

# Fit on the full training set; the returned estimator is the cell's output.
pipeline_svm.fit(X_train, y_train)

## Logistic Regression

In [6]:
# Hyper-parameters used for this pipeline (presumably the result of an earlier
# parameter search — TODO confirm):
# {'lr__C': 10, 'lr__penalty': 'l2', 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 1)}
lr = LogisticRegression(penalty='l2', C=10)
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_df=0.75)

# TF-IDF features feed straight into the logistic-regression classifier.
pipeline_lr = Pipeline(steps=[('tfidf', tfidf), ('lr', lr)])

# Fit on the full training set; the returned estimator is the cell's output.
pipeline_lr.fit(X_train, y_train)

## MultinomialNB

In [7]:
# Hyper-parameters used for this pipeline (presumably the result of an earlier
# parameter search — TODO confirm):
# {'nb__alpha': 1, 'nb__fit_prior': True, 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 2)}
nb = MultinomialNB(fit_prior=True, alpha=1)
# Unlike the other pipelines, this one uses unigrams and bigrams.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_df=0.75)

# TF-IDF features feed straight into the multinomial naive Bayes classifier.
pipeline_nb = Pipeline(steps=[('tfidf', tfidf), ('nb', nb)])

# Fit on the full training set; the returned estimator is the cell's output.
pipeline_nb.fit(X_train, y_train)

## Random Forest

In [8]:
# Hyper-parameters used for this pipeline (presumably the result of an earlier
# parameter search — TODO confirm):
# {'rf__max_depth': None, 'rf__n_estimators': 50, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 1)}
tfidf = TfidfVectorizer(max_df=0.5, ngram_range=(1, 1))

# A random forest is stochastic (bootstrap samples and random feature subsets),
# so the seed is pinned: without it every run fits a different model and the
# exported predictions are not reproducible.
rf = RandomForestClassifier(max_depth=None, n_estimators=50, random_state=42)

# TF-IDF features feed straight into the random-forest classifier.
pipeline_rf = Pipeline([
    ('tfidf', tfidf),
    ('rf', rf)
])

# Fit on the full training set; the returned estimator is the cell's output.
pipeline_rf.fit(X_train, y_train)

# PREDICT

In [21]:
# Read the unlabelled test set from the working directory; the raw frame is
# kept so its columns can be reused when building the prediction files.
df_test_ = pd.read_csv(filepath_or_buffer='test.csv')

In [17]:
# Apply the same preprocessing helper to the test set that was applied to the
# training set (`prepropcess_data` is already imported in the first cell).
df_test = prepropcess_data(df_test_)
df_final_test = df_test

# Predictions are attached to the rows of the raw `df_test_` frame by position
# in the prediction loop, so the preprocessed frame must have exactly as many
# rows as the raw one. Fail here with a clear message rather than at predict time.
assert len(df_final_test) == len(df_test_), (
    "preprocessing changed the number of test rows; "
    "predictions would not line up with df_test_"
)

# Text column as a plain array, matching how X_train was built.
X_test = df_final_test['final_text'].values

In [25]:
# Fitted pipelines, in the order their prediction frames are stored.
models_fit = [pipeline_svm, pipeline_lr, pipeline_nb, pipeline_rf]

# For every fitted pipeline, build a fresh frame from the raw test data with
# the text/metadata columns removed and that pipeline's predictions appended
# as a `target` column.
models_pred = [
    df_test_.drop(columns=['location', "keyword", 'text']).assign(
        target=fitted.predict(X_test)
    )
    for fitted in models_fit
]



In [26]:
# Display the list of per-model prediction frames (one frame per entry of
# models_fit, in the same order).
models_pred

[         id  target
 0         0       1
 1         2       1
 2         3       1
 3         9       1
 4        11       1
 ...     ...     ...
 3258  10861       1
 3259  10865       1
 3260  10868       1
 3261  10874       1
 3262  10875       0
 
 [3263 rows x 2 columns],
          id  target
 0         0       1
 1         2       1
 2         3       1
 3         9       1
 4        11       1
 ...     ...     ...
 3258  10861       1
 3259  10865       1
 3260  10868       1
 3261  10874       1
 3262  10875       0
 
 [3263 rows x 2 columns],
          id  target
 0         0       1
 1         2       1
 2         3       1
 3         9       1
 4        11       1
 ...     ...     ...
 3258  10861       1
 3259  10865       1
 3260  10868       1
 3261  10874       1
 3262  10875       1
 
 [3263 rows x 2 columns],
          id  target
 0         0       1
 1         2       1
 2         3       1
 3         9       1
 4        11       1
 ...     ...     ...
 3258  10861 

In [27]:
import os

# Short labels for the fitted models; the position of each label matches the
# position of its prediction frame in models_pred.
models_names = ['SVC', 'LR', 'NB', 'RF']

# Write one CSV per model into the working directory, named after the model
# and without the pandas index column.
for position, label in enumerate(models_names):
    file_name = f"{label}.csv"
    models_pred[position].to_csv(file_name, index=False)