In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import keras
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [4]:
train=pd.read_csv('train.csv',index_col='id')
test=pd.read_csv('test.csv',index_col='id')
target=pd.read_csv('sample_submission.csv',index_col='id')

In [5]:
train.drop(['keyword','location'],axis=1,inplace=True)
test.drop(['keyword','location'],axis=1,inplace=True)

In [6]:
train.head()
#train.shape

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Our Deeds are the Reason of this #earthquake M...,1
4,Forest fire near La Ronge Sask. Canada,1
5,All residents asked to 'shelter in place' are ...,1
6,"13,000 people receive #wildfires evacuation or...",1
7,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
train[train["target"] == 1]["text"].values[5]


'#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires'

In [8]:
import string

In [9]:
train['text']=train['text'].str.lower()

In [10]:
train.head()

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,our deeds are the reason of this #earthquake m...,1
4,forest fire near la ronge sask. canada,1
5,all residents asked to 'shelter in place' are ...,1
6,"13,000 people receive #wildfires evacuation or...",1
7,just got sent this photo from ruby #alaska as ...,1


In [11]:
text=train['text']

In [12]:
text.head()

id
1    our deeds are the reason of this #earthquake m...
4               forest fire near la ronge sask. canada
5    all residents asked to 'shelter in place' are ...
6    13,000 people receive #wildfires evacuation or...
7    just got sent this photo from ruby #alaska as ...
Name: text, dtype: object

In [13]:
def remove_punctuation(text):
    return text.translate(str.maketrans('','',string.punctuation))
text_clean=text.apply(lambda text:remove_punctuation(text))

In [14]:
text_clean.head()

id
1    our deeds are the reason of this earthquake ma...
4                forest fire near la ronge sask canada
5    all residents asked to shelter in place are be...
6    13000 people receive wildfires evacuation orde...
7    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [15]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

In [16]:
def stopwords_(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
text_clean = text_clean.apply(lambda text: stopwords_(text))

In [17]:
text_clean.head()

id
1         deeds reason earthquake may allah forgive us
4                forest fire near la ronge sask canada
5    residents asked shelter place notified officer...
6    13000 people receive wildfires evacuation orde...
7    got sent photo ruby alaska smoke wildfires pou...
Name: text, dtype: object

In [18]:
', '.join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [19]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
def lemma(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

In [20]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nanda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
from nltk.stem import WordNetLemmatizer   
lemmatizer = WordNetLemmatizer() 
text_clean=text_clean.apply(lambda text: lemma(text))

In [22]:
lemmatizer.lemmatize('wrote', 'v')

'write'

In [23]:
text_clean.head()

id
1           deed reason earthquake may allah forgive u
4                forest fire near la ronge sask canada
5    resident asked shelter place notified officer ...
6    13000 people receive wildfire evacuation order...
7    got sent photo ruby alaska smoke wildfire pour...
Name: text, dtype: object

In [35]:
#pip install wordcloud

In [25]:
from wordcloud import WordCloud


In [26]:
import matplotlib.pyplot as plt
fig, (ax1) = plt.subplots(1, figsize=[12,12])
wordcloud=WordCloud(background_color='white',width=600,height=400).generate(" ".join(text_clean))
WordCloud().generate(" ".join(text_clean))
ax1.imshow(wordcloud)
ax1.axis('off')
ax1.set_title('Frequent Words',fontsize=16)

Text(0.5, 1.0, 'Frequent Words')

In [27]:
train.head()

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,our deeds are the reason of this #earthquake m...,1
4,forest fire near la ronge sask. canada,1
5,all residents asked to 'shelter in place' are ...,1
6,"13,000 people receive #wildfires evacuation or...",1
7,just got sent this photo from ruby #alaska as ...,1


In [28]:
text_clean.head()

id
1           deed reason earthquake may allah forgive u
4                forest fire near la ronge sask canada
5    resident asked shelter place notified officer ...
6    13000 people receive wildfire evacuation order...
7    got sent photo ruby alaska smoke wildfire pour...
Name: text, dtype: object

In [36]:
df = pd.DataFrame({"text": text_clean})
df.head()

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
1,deed reason earthquake may allah forgive u
4,forest fire near la ronge sask canada
5,resident asked shelter place notified officer ...
6,13000 people receive wildfire evacuation order...
7,got sent photo ruby alaska smoke wildfire pour...


In [30]:
train.update(df)

In [31]:
train.head()

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,deed reason earthquake may allah forgive u,1
4,forest fire near la ronge sask canada,1
5,resident asked shelter place notified officer ...,1
6,13000 people receive wildfire evacuation order...,1
7,got sent photo ruby alaska smoke wildfire pour...,1


In [32]:
x=train.iloc[:,0]
y=train.iloc[:,-1]


In [33]:
from sklearn.model_selection import train_test_split

In [34]:
x_train, x_test, y_train, y_test = train_test_split(feature_matrix, y, test_size=0.33, random_state=42)

NameError: name 'feature_matrix' is not defined

In [None]:
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(train['text'])

In [None]:
feature_matrix.toarray()


In [84]:
logit = LogisticRegression()


In [85]:
kf = KFold(n_splits=10, shuffle=True, random_state=1)

In [87]:
cv_results = cross_val_score(logit, # Pipeline
                             feature_matrix, # Feature matrix
                             y, # Target vector
                             cv=kf, # Cross-validation technique
                             scoring="accuracy", # Loss function
                             n_jobs=-1)

In [88]:
cv_results.mean()

0.8027062402350822

In [92]:
logit.fit(feature_matrix,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [93]:
pred=logit.predict(x_test)

In [94]:
from sklearn.metrics import accuracy_score

In [95]:
score =accuracy_score(y_test,pred)

In [96]:
score

0.8949462793473936

In [None]:
63