### MSDS Deep Learning Week 4: NLP Disaster Tweets Kaggle Mini-Project
Kaggle competition: Natural Language Processing with Disaster Tweets.

This Kaggle competition is about classifying tweets: each tweet is labelled as describing a real disaster (`target` = 1) or not (`target` = 0). It is an excellent introduction to Natural Language Processing (NLP).

#### Importing required libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import string
import numpy as np
nltk.download('wordnet')

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Explore data

In [2]:
# Read the competition's training and test CSVs into DataFrames.
dftrain, dftest = (
    pd.read_csv('/kaggle/input/nlp-getting-started/train.csv'),
    pd.read_csv('/kaggle/input/nlp-getting-started/test.csv'),
)

In [3]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
dftest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


#### Preprocess

In [5]:
def preprocess1(text):
    """Normalise one tweet into lowercase text with noise stripped out.

    Steps, applied in this order:

    1. cast to ``str`` and lowercase;
    2. drop every run of digits;
    3. drop anything enclosed in square brackets (non-greedy match);
    4. drop ``http(s)://`` and ``www.`` URLs up to the next whitespace;
    5. drop characters in the listed emoji/pictograph code-point ranges;
    6. drop all ASCII punctuation (``string.punctuation``).

    Whitespace is never collapsed, so removed tokens can leave leading,
    trailing or doubled spaces behind.

    Args:
        text: The raw tweet. Any object is accepted; it is passed through
            ``str`` first.

    Returns:
        The cleaned string.
    """
    text = str(text).lower()
    # Patterns are raw strings: "\d", "\[" and "\S" are not valid escapes in
    # an ordinary string literal and trigger warnings on newer Pythons.
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # NOTE(review): the last range (U+24C2-U+1F251) is very wide and also
    # strips non-emoji characters inside that span (e.g. CJK ideographs) --
    # confirm this is intended.
    text = re.sub(
        r"["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U000024C2-\U0001F251"
        "]+",
        "",
        text,
    )
    # Punctuation goes last so the URL pattern above can still see "://" and ".".
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    return text

In [6]:
# Add a `clean_text` column to both frames by running preprocess1 on every tweet.
for frame in (dftrain, dftest):
    frame['clean_text'] = frame['text'].apply(preprocess1)

dftrain.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...


In [7]:
from nltk.corpus import stopwords
nltk.download('stopwords')
# Build the English stop-word set used as a token filter by stemming() and preprocess2().
stop=set(stopwords.words('english'))
# Take 'not' out of the set so it is NOT filtered out as a stop word
# (presumably because negation carries meaning -- confirm intent).
stop.remove('not')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
# Single shared Porter stemmer instance, used by stemming().
ps = PorterStemmer()

In [9]:
def stemming(text):
    """Tokenise, stop-word-filter and Porter-stem each document in ``text``.

    Args:
        text: An iterable of strings (e.g. a ``clean_text`` column).

    Returns:
        A list with one entry per document; each entry is the list of stemmed
        tokens that were not in the ``stop`` set.
    """
    # Iterate over the argument itself: the previous version ignored ``text``
    # and always processed dftrain['clean_text'], whatever the caller passed.
    return [
        [ps.stem(word) for word in word_tokenize(doc) if word not in stop]
        for doc in text
    ]

In [10]:
# Stem the cleaned training tweets and display entries 1 through 9 as a sample.
text_after_stemming=stemming(dftrain['clean_text'])
text_after_stemming[1:10]

[['forest', 'fire', 'near', 'la', 'rong', 'sask', 'canada'],
 ['resid',
  'ask',
  'shelter',
  'place',
  'notifi',
  'offic',
  'evacu',
  'shelter',
  'place',
  'order',
  'expect'],
 ['peopl', 'receiv', 'wildfir', 'evacu', 'order', 'california'],
 ['got',
  'sent',
  'photo',
  'rubi',
  'alaska',
  'smoke',
  'wildfir',
  'pour',
  'school'],
 ['rockyfir',
  'updat',
  'california',
  'hwi',
  'close',
  'direct',
  'due',
  'lake',
  'counti',
  'fire',
  'cafir',
  'wildfir'],
 ['flood',
  'disast',
  'heavi',
  'rain',
  'caus',
  'flash',
  'flood',
  'street',
  'manit',
  'colorado',
  'spring',
  'area'],
 ['im', 'top', 'hill', 'see', 'fire', 'wood'],
 ['there', 'emerg', 'evacu', 'happen', 'build', 'across', 'street'],
 ['im', 'afraid', 'tornado', 'come', 'area']]

In [11]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
lemma = WordNetLemmatizer()


def preprocess2(text):
    """Drop stop words from every string in ``text`` and lemmatize what remains.

    ``text`` must support ``.apply`` (e.g. a pandas Series of strings). Each
    string is split on single spaces, tokens in ``stop`` are discarded, the
    survivors are lemmatized and re-joined with single spaces.
    """
    def _lemmatize_tweet(tweet):
        kept = [lemma.lemmatize(token) for token in tweet.split(' ') if token not in stop]
        return ' '.join(kept)

    return text.apply(_lemmatize_tweet)

In [13]:
# Re-display the frame. clean_text is unchanged from the earlier display:
# preprocess2 is defined above but has not been applied to dftrain.
dftrain.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...


In [14]:
# Per-class word counts: dis_freq maps each whitespace-separated token to the
# number of times it occurs in clean_text of target == 1 rows, and ndis_freq
# does the same for target == 0 rows.
# (A module-level `global` statement is a no-op, so it has been dropped;
# explode() avoids building the wide padded frame that split(expand=True)
# creates.)
# NOTE: the counts use every row of dftrain, including rows that train_model
# later holds out for validation, so hold-out scores for features derived
# from these dicts are optimistic.
dis_freq, ndis_freq = (
    dftrain.loc[dftrain['target'] == label, 'clean_text']
    .str.split()
    .explode()
    .value_counts()
    .to_dict()
    for label in (1, 0)
)

In [15]:
def create_vector(tweet, dis_counts=None, ndis_counts=None):
    """Turn a tweet into a two-element count feature.

    The tweet is split on single spaces. For each resulting token its count
    is looked up in both tables, and the counts are summed per table. Tokens
    missing from a table contribute 0.

    Args:
        tweet: The (cleaned) tweet text.
        dis_counts: Optional mapping token -> count for the target == 1
            class. Defaults to the module-level ``dis_freq``.
        ndis_counts: Optional mapping token -> count for the target == 0
            class. Defaults to the module-level ``ndis_freq``.

    Returns:
        ``[total_dis, total_ndis]``.
    """
    # Fall back to the notebook-level tables so `Series.apply(create_vector)`
    # keeps working unchanged.
    if dis_counts is None:
        dis_counts = dis_freq
    if ndis_counts is None:
        ndis_counts = ndis_freq

    words = tweet.split(' ')
    total_dis = sum(dis_counts.get(word, 0) for word in words)
    total_ndis = sum(ndis_counts.get(word, 0) for word in words)
    return [total_dis, total_ndis]

In [16]:
# One [total_dis, total_ndis] pair per tweet, for the train and test frames.
# Both use the count tables built from dftrain.
vector1=dftrain['clean_text'].apply(create_vector)
vector2=dftest['clean_text'].apply(create_vector)

In [17]:
# Expand each Series of two-element lists into a two-column frame whose
# columns are named data0 / data1.
df1, df2 = (
    pd.DataFrame(vec.tolist()).add_prefix('data')
    for vec in (vector1, vector2)
)
print(df1)

      data0  data1
0      2930   3729
1       307    100
2      4070   4147
3      1433    927
4      2039   2583
...     ...    ...
7608   1276   1432
7609   8694   9495
7610   1057    950
7611   2848   2808
7612   2301   2326

[7613 rows x 2 columns]


#### Model Building

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [19]:
def train_model(model, X, y, test, test_size=0.3, random_state=1):
    """Fit ``model`` on a train split, report hold-out metrics, predict ``test``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place, so any previous fit on the same instance is replaced.
        X: Feature matrix for the labelled data.
        y: Labels aligned with ``X``.
        test: Feature matrix to predict after fitting.
        test_size: Hold-out size passed to ``train_test_split``
            (default 0.3, the value previously hard-coded).
        random_state: Seed passed to ``train_test_split``
            (default 1, the value previously hard-coded).

    Returns:
        ``model.predict(test)``.

    Side effects:
        Prints a ``classification_report`` for the hold-out split.
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    print(classification_report(y_holdout, model.predict(X_holdout)))
    # NOTE: the predictions returned below come from the model fitted on the
    # training split only; the hold-out rows are never used for fitting.
    return model.predict(test)

In [20]:
# Features: the two per-tweet count columns (data0, data1). Labels: the target column.
X=df1
y=dftrain['target']

In [21]:
# Logistic regression with default settings on the two count features.
# y_pred holds the predictions for the test-set features in df2.
lr = LogisticRegression()
y_pred=train_model(lr,X,y,df2)

              precision    recall  f1-score   support

           0       0.77      0.77      0.77      1326
           1       0.68      0.68      0.68       958

    accuracy                           0.73      2284
   macro avg       0.72      0.72      0.72      2284
weighted avg       0.73      0.73      0.73      2284



In [22]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features with presence/absence encoding (binary=True).
cv= CountVectorizer(max_features = 2500, binary=True)
# max_features caps the vocabulary size, i.e. the length of each feature vector.
# The vocabulary is learned from the training tweets only; the test tweets are
# transformed with that same vocabulary. X is rebound here and replaces df1.
# NOTE: fitting uses all of dftrain, including rows train_model later holds out.
X = cv.fit_transform(dftrain['clean_text']).toarray()
X_test = cv.transform(dftest['clean_text']).toarray()

In [23]:
# Inspect the dense bag-of-words matrix built in the previous cell.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [24]:
# Same `lr` instance as before: train_model calls fit again, now on the
# bag-of-words features. y_pred is overwritten.
y_pred=train_model(lr,X,y,X_test)

              precision    recall  f1-score   support

           0       0.80      0.86      0.83      1326
           1       0.78      0.70      0.74       958

    accuracy                           0.79      2284
   macro avg       0.79      0.78      0.78      2284
weighted avg       0.79      0.79      0.79      2284



In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features with default settings. This rebinds `cv`, so the
# CountVectorizer created earlier is no longer reachable under that name.
cv = TfidfVectorizer()
# Fit on the training tweets only and reuse the fitted vectorizer for the
# test tweets. Unlike the bag-of-words cell, no .toarray() is applied here.
X_tdidf = cv.fit_transform(dftrain['clean_text'])
X_tdidf_test = cv.transform(dftest['clean_text'])

In [26]:
# Densify and display only the first tweet's TF-IDF row.
X_tdidf[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [27]:
# Refit the same `lr` instance on the TF-IDF features. y_pred is overwritten again.
y_pred=train_model(lr,X_tdidf,y,X_tdidf_test)

              precision    recall  f1-score   support

           0       0.78      0.92      0.84      1326
           1       0.85      0.64      0.73       958

    accuracy                           0.80      2284
   macro avg       0.81      0.78      0.79      2284
weighted avg       0.81      0.80      0.80      2284



In [28]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes on the TF-IDF features. This is the last assignment
# to y_pred, so these are the predictions written to the submission below.
mnb=MultinomialNB()
y_pred=train_model(mnb,X_tdidf,y,X_tdidf_test)

              precision    recall  f1-score   support

           0       0.77      0.93      0.84      1326
           1       0.86      0.61      0.72       958

    accuracy                           0.80      2284
   macro avg       0.82      0.77      0.78      2284
weighted avg       0.81      0.80      0.79      2284



In [29]:
# Pair each test-set id with the most recent predictions held in y_pred.
submission = (
    dftest[['id']]
    .reset_index(drop=True)
    .assign(target=y_pred)
)

In [30]:
# Inspect the predictions that were copied into the submission frame.
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [31]:
# Inspect the submission frame (columns: id, target) before writing it out.
submission

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,0
3260,10868,1
3261,10874,1


In [32]:
# Write the submission to the current working directory; index=False keeps
# the row index out of the file so only the id and target columns are written.
submission.to_csv('Week4NLP_submission.csv', index=False)