## BaggingClassifier + TF-IDF 

In [1]:
import pandas as pd
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [2]:
training_data = pd.read_csv('train.csv')

#### Combining the 3 columns ( keyword + location + text ) - filling the NAN by a blank or ''

In [3]:
training_data['text'] = training_data['keyword'].fillna('') +' '+ training_data['location'].fillna('')+' '  \
                        + training_data['text'].fillna('')
training_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' ar...,1
3,6,,,"13,000 people receive #wildfires evacuation ...",1
4,7,,,Just got sent this photo from Ruby #Alaska a...,1


In [4]:
training_data = training_data.drop(columns=['id','keyword','location'],axis=1)
training_data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' ar...,1
3,"13,000 people receive #wildfires evacuation ...",1
4,Just got sent this photo from Ruby #Alaska a...,1


#### Data Cleaning step involves tokenizing, removing the stopwords and removing non-alphanumeric tokens

In [5]:
def clean_data(text):
    tokens = text.split()
    no_stopwords = [x for x in tokens if x not in stop_words]
    no_nonalphanum = [x.lower() for x in no_stopwords if x.isalnum()]
    return ' '.join(no_nonalphanum)

In [6]:
training_data['text'] = training_data['text'].apply(clean_data)
print(training_data.shape)
training_data.head()

(7613, 2)


Unnamed: 0,text,target
0,our deeds reason may allah forgive us,1
1,forest fire near la ronge canada,1
2,all residents asked notified no evacuation she...,1
3,people receive evacuation orders california,1
4,just got sent photo ruby smoke pours school,1


In [7]:
test_data = pd.read_csv('test.csv')
test_id = test_data['id']
test_data['text'] = test_data['keyword'].fillna('') +' '+ test_data['location'].fillna('')+' '  \
                        + test_data['text'].fillna('')
test_data = test_data.drop(columns=['id','keyword','location'],axis=1)
test_data['text'] = test_data['text'].apply(clean_data)
print(test_data.shape)
test_data.head()

(3263, 1)


Unnamed: 0,text
0,just happened terrible car crash
1,heard different stay safe
2,forest fire spot geese fleeing across i cannot...
3,apocalypse
4,typhoon soudelor kills 28 china taiwan


#### combining the training and testing vocabulary to fit the TF-IDF Vectorizer over all the available data 

In [8]:
combined_data = pd.DataFrame()
combined_data = combined_data.append(training_data,ignore_index=True,sort=False)
combined_data = combined_data.append(test_data,ignore_index=True,sort=False)
combined_data.shape

(10876, 2)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import f1_score, accuracy_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

#### fitting the TF-IDF Vectorizer with the combined data and then using it to transform the training data

In [10]:
tfidf = TfidfVectorizer()
tfidf.fit(combined_data.text)
text_tfidf = tfidf.transform(training_data.text)

In [11]:
x_train,x_test,y_train,y_test = train_test_split(text_tfidf,training_data.target)

In [12]:
log = LogisticRegression(class_weight='balanced',warm_start=True,solver='sag')
log.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=True)

In [13]:
predict = log.predict(x_test)
print('fl_score - ',f1_score(y_test,predict))
print('accuracy_score - ',accuracy_score(y_test,predict))

fl_score -  0.7525586995785671
accuracy_score -  0.7841386554621849


In [14]:
log.fit(text_tfidf,training_data.target)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=True)

#### Transforming the test data to vectors

In [15]:
test_tfidf = tfidf.transform(test_data.text)

#### Performing the predictions over the test data

In [16]:
predict = log.predict(test_tfidf)
predictions = pd.DataFrame({'id':test_id,'target':predict})
predictions.to_csv('submission.csv',index=False)

#### Accuracy :---
#####                - Local Accuracy : 76.47 (over split training data)
#####                - Online Accuracy : 79.44 (After fitting over split test data)
#####                - Online Accuracy : 77.81 (After fitting over all training data)