# Roll No: 225229119

### Importing libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import string
import numpy as np

In [2]:
 nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\1mscdsa19\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

### Importing dataset

In [4]:
train_data= pd.read_csv("train (1).csv")
test_data = pd.read_csv('test (1).csv')

In [5]:
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


### Preprocessing :-
    

In [6]:
def preprocess1(text):
    text=str(text).lower() #Converts text to lowercase
    text=re.sub('\d+', '', text) #removes numbers
    text=re.sub('\[.*?\]', '', text) #removes HTML tags
    text=re.sub('https?://\S+|www\.\S+', '', text) #removes url
    text=re.sub(r"["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", "", text) #removes emojis
    text=re.sub('[%s]' % re.escape(string.punctuation),'',text) #removes punctuation
    return text

In [7]:
train_data['clean_text']=train_data['text'].apply(preprocess1)
test_data['clean_text']=test_data['text'].apply(preprocess1)

train_data.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...


In [8]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop=set(stopwords.words('english'))
stop.remove('not')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\1mscdsa19\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [9]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
ps = PorterStemmer()

In [10]:
def stemming(text):
    stem_strings=list(map(lambda y: [ps.stem(word) for word in word_tokenize(y) if word not in stop],train_data['clean_text']))
    return stem_strings

In [11]:
text_after_stemming=stemming(train_data['clean_text'])
text_after_stemming[1:5]

[['forest', 'fire', 'near', 'la', 'rong', 'sask', 'canada'],
 ['resid',
  'ask',
  'shelter',
  'place',
  'notifi',
  'offic',
  'evacu',
  'shelter',
  'place',
  'order',
  'expect'],
 ['peopl', 'receiv', 'wildfir', 'evacu', 'order', 'california'],
 ['got',
  'sent',
  'photo',
  'rubi',
  'alaska',
  'smoke',
  'wildfir',
  'pour',
  'school']]

In [12]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\1mscdsa19\AppData\Roaming\nltk_data...


In [13]:
lemma=WordNetLemmatizer()
def preprocess2(text):
    final_text=text.apply(lambda x: ' '.join(lemma.lemmatize(word) for word in x.split(' ') if word not in stop))
    return final_text

In [14]:
train_data['final']=preprocess2(train_data['clean_text'])
test_data['final']=preprocess2(test_data['clean_text'])

In [15]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target,clean_text,final
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,resident asked shelter place notified officer ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...,people receive wildfire evacuation order cali...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,got sent photo ruby alaska smoke wildfire pour...


In [16]:
global dis_freq, ndis_freq
dis_freq=train_data.loc[train_data['target']==1, 'final'].str.split(expand=True).stack().value_counts().to_dict()
ndis_freq=train_data.loc[train_data['target']==0, 'final'].str.split(expand=True).stack().value_counts().to_dict()

In [17]:
def create_vector(tweet):
    total_dis =0
    total_ndis =0
    for word in tweet.split(' '):
        total_dis+=dis_freq[word] if word in dis_freq.keys() else 0
        total_ndis+=ndis_freq[word] if word in ndis_freq.keys() else 0 
    return [total_dis, total_ndis]


In [18]:
vector=train_data['final'].apply(create_vector)
vector2=test_data['final'].apply(create_vector)

In [19]:
df1 = pd.DataFrame(vector.values.tolist()).add_prefix('data')
df2 = pd.DataFrame(vector2.values.tolist()).add_prefix('data')
print(df1)

      data0  data1
0       220    217
1       392    119
2       159     72
3       366    126
4       192    208
5       588    146
6       425    163
7       371    460
8       273    184
9       144    297
10      227    160
11      191    214
12      115    163
13      104     40
14      291    164
15       55     84
16       13    100
17       16     35
18       80     54
19        0      1
20        1      3
21       17     31
22       11     95
23       46    102
24        0      1
25       41    125
26       46     92
27       11    100
28        0      1
29       92    257
...     ...    ...
7583    539    119
7584     22     85
7585    275     92
7586     43     19
7587     17     44
7588    313     65
7589     57     17
7590    364     71
7591    252    181
7592    457    146
7593     76    226
7594    113    169
7595    301     72
7596    342    159
7597    344     23
7598    125     76
7599    264     51
7600     96     34
7601    275     72
7602    110     81
7603    302 

### Model Building

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [21]:
def train_model(model,X,y, test):
    X_train,X_test, y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)    
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    print(classification_report(y_test,y_pred))
    return model.predict(test)

In [22]:
X=df1
y=train_data['target']

## 

In [23]:
lr = LogisticRegression()
y_pred=train_model(lr,X,y,df2)

             precision    recall  f1-score   support

          0       0.80      0.87      0.83      1326
          1       0.79      0.69      0.74       958

avg / total       0.79      0.80      0.79      2284



## Model Evaluation

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features = 2500, binary=True)
# Max-features - vector length
X = cv.fit_transform(train_data['final']).toarray()
X_test = cv.transform(test_data['final']).toarray()

In [25]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int64)

In [26]:
y_pred=train_model(lr,X,y,X_test)

             precision    recall  f1-score   support

          0       0.80      0.86      0.83      1326
          1       0.78      0.69      0.74       958

avg / total       0.79      0.79      0.79      2284



In [27]:
#TD_IDF
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X_tdidf = cv.fit_transform(train_data['final'])
X_tdidf_test = cv.transform(test_data['final'])

In [28]:
X_tdidf[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [29]:
y_pred=train_model(lr,X_tdidf,y,X_tdidf_test)

             precision    recall  f1-score   support

          0       0.78      0.92      0.84      1326
          1       0.85      0.64      0.73       958

avg / total       0.81      0.80      0.79      2284



In [30]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
y_pred=train_model(mnb,X_tdidf,y,X_tdidf_test)

             precision    recall  f1-score   support

          0       0.78      0.92      0.84      1326
          1       0.85      0.65      0.74       958

avg / total       0.81      0.80      0.80      2284



In [31]:
submission = test_data[['id']].reset_index(drop=True)
submission['target'] = y_pred

In [32]:
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [33]:
submission.to_csv('submission.csv', index=False)

In [34]:
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0
