In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python


# import packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
# see more columns in dataframe
pd.set_option('display.max_columns', 100)

# machine learning packages
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# nlp packages
import nltk
nltk.download('averaged_perceptron_tagger_eng')
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag




[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [2]:
# get the package versions
print('the numpy version is:{}'.format(np.__version__))
print('the pandas version is:{}'.format(pd.__version__))
print('the scikit-learn version is:{}'.format(sklearn.__version__))
print('the re version is:{}'.format(re.__version__))
print('the nltk version is:{}'.format(nltk.__version__))

the numpy version is:1.26.4
the pandas version is:2.2.3
the scikit-learn version is:1.2.2
the re version is:2.2.1
the nltk version is:3.9.1


In [3]:
# look at list of stopwords
stoplist= stopwords.words('english')
print(stoplist)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
# make function to clean text and split into tokens
# return tokens as type list
# filter out the stop words
def make_token(string1):
    text1= string1.lower()
    # remove non alphabet characters
    text1= re.sub(r"[^a-z ]", "", text1)
    # tokenize text
    text1= word_tokenize(text1)
    # remove stopwords
    text2= [word for word in text1 if word not in stoplist]
    return text2


In [5]:
# test function to clean text and split into tokens
string10= "the black cat runs quickly to hide under the couch. it can't jump high."

string11= make_token(string10)

print(string11)

['black', 'cat', 'runs', 'quickly', 'hide', 'couch', 'cant', 'jump', 'high']


In [6]:
# define the wordnetlemmatizer
lemmatizer= WordNetLemmatizer()

In [7]:
# function to input tokens and output lemma

def make_lemma(list1):
    # find first character of pos_tag for each token
    char1= [word[1][0] for word in pos_tag(list1)]
    # dictionary for wordnet pos
    pos_dict= {'R':wordnet.ADV, 'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN}
    # list of wordnet pos for each token
    good_list= [pos_dict.get(word1, wordnet.NOUN) for word1 in char1]
    good_zip= zip(list1, good_list)
    # lemma for each token
    text2= " ".join([lemmatizer.lemmatize(x1, y1) for x1, y1 in good_zip])
    return text2



In [8]:
# test function to clean text and split into tokens
# test function to make lemma
string10= 'the black cat runs quickly to hide under the couch.  We ran fast.  The red car drove fast red. The striped bat hanging.'

print(pos_tag(word_tokenize(string10)))

print('-'*80)

string11= make_token(string10)

print(string11)

string12= make_lemma(string11)

print('-' * 80)

print(string12)

print('-' * 80)

[('the', 'DT'), ('black', 'JJ'), ('cat', 'NN'), ('runs', 'VBZ'), ('quickly', 'RB'), ('to', 'TO'), ('hide', 'VB'), ('under', 'IN'), ('the', 'DT'), ('couch', 'JJ'), ('.', '.'), ('We', 'PRP'), ('ran', 'VBD'), ('fast', 'RB'), ('.', '.'), ('The', 'DT'), ('red', 'JJ'), ('car', 'NN'), ('drove', 'VBD'), ('fast', 'RB'), ('red', 'JJ'), ('.', '.'), ('The', 'DT'), ('striped', 'JJ'), ('bat', 'NN'), ('hanging', 'NN'), ('.', '.')]
--------------------------------------------------------------------------------
['black', 'cat', 'runs', 'quickly', 'hide', 'couch', 'ran', 'fast', 'red', 'car', 'drove', 'fast', 'red', 'striped', 'bat', 'hanging']
--------------------------------------------------------------------------------
black cat run quickly hide couch ran fast red car drive fast red strip bat hanging
--------------------------------------------------------------------------------


In [9]:
# import the training data
train_df= pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

# import the test data
test_df= pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
x_test= test_df['text']
id_test= test_df['id']

# look at the first 20 rows of training dataframe
train_df.head(20)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [10]:
# split the training and validation
x_train, x_val, y_train, y_val= train_test_split(train_df['text'], train_df['target'], shuffle= True, random_state= 24, test_size= 0.3)

# look at shape after splitting
print('shape of x_train is:{}'.format(x_train.shape))
print('shape of x_val is:{}'.format(x_val.shape))
print('shape of y_train is:{}'.format(y_train.shape))
print('shape of y_val is:{}'.format(y_val.shape))

shape of x_train is:(5329,)
shape of x_val is:(2284,)
shape of y_train is:(5329,)
shape of y_val is:(2284,)


In [11]:
# look at first 20 rows of x_train
x_train.head(20)

1721    SSP East says a car AEG 061 driven by a young ...
1877    @jaureguiswisdom lmao well i only know one and...
1206    ? High Skies - Burning Buildings ? http://t.co...
4337    Swansea ?plot hijack transfer move for Southam...
4494    HURRICANE GUILLERMO LIVE NOAA TRACKING / LOOPI...
2513    I rated The Hobbit: The Desolation of Smaug (2...
4458    Broadcast journalism: hostages to fortune othe...
1089    Oops.\nH bomb lost 70 miles off the Okinawan c...
1520    Something Catastrophic Is Coming: Should We Tu...
6385    Pic of 16yr old PKK suicide bomber who detonat...
2951    So today I fell off a rock scraped my whole bu...
897     @Fantosex Now suck it up because that's all yo...
3757                  WCW @catsandsyrup THA BITCH IS FIRE
2477    Madhya Pradesh Train Derailment: Village Youth...
4974    President Barack Obama has on air meltdown ove...
502     Kelly Osbourne attacked for racist Donald Trum...
2832    .POTUS #StrategicPatience is a strategy for #G...
3043    @AGeek

In [12]:
# use functions to make_token and make_lemma
x_train_token= x_train.apply(make_token)
x_train_lemma= x_train_token.apply(make_lemma)

# apply functions to validation
x_val_token= x_val.apply(make_token)
x_val_lemma= x_val_token.apply(make_lemma)

# apply functions to test
x_test_token= x_test.apply(make_token)
x_test_lemma= x_test_token.apply(make_lemma)

In [13]:
# look at x_train after applying make_lemma
print(x_train_lemma.head(20))

1721    ssp east say car aeg drive young man collide a...
1877    jaureguiswisdom lmao well know one ive crush o...
1206      high sky burn building httptcouvqikx nowplaying
4337    swansea plot hijack transfer move southampton ...
4494    hurricane guillermo live noaa track loop wedau...
2513     rat hobbit desolation smaug imdb httptcodjdewdwr
4458    broadcast journalism hostages fortune otherwis...
1089    oopsh bomb lose mile okinawan coastfell ship h...
1520       something catastrophic come tune httptcoajzawi
6385    pic yr old pkk suicide bomber detonate bomb tu...
2951    today fell rock scrap whole butt nearly drown ...
897      fantosex suck thats youre bloody get mean amends
3757                      wcw catsandsyrup tha bitch fire
2477    madhya pradesh train derailment village youth ...
4974    president barack obama air meltdown opposition...
502     kelly osbourne attacked racist donald trump re...
2832    potus strategicpatience strategy genocide refu...
3043    ageeky

In [14]:
# look at x_val after applying make_lemma
print(x_val_lemma)

3068    kid get disney version game operation aa batte...
3148    update indiana state police reopen near lafaye...
3139    god forbid anyone family know answer phone nee...
7485    first wreck today glad mom okay couldve lot ba...
6023    exploration take seismic shift gabon somalia w...
                              ...                        
2953                               hope drown eeasterling
3845                    kelworldpeace taxstone yoga flame
2365    shame tookem intrigue dominant force job clist...
2525    nikostar yall lake ohio think yall abject deso...
7315    ariaahrary thetawniest control wild fire calif...
Name: text, Length: 2284, dtype: object


In [15]:
# look at x_test after applying make_lemma
print(x_test_lemma)

0                               happen terrible car crash
1       heard earthquake different city stay safe ever...
2       forest fire spot pond geese flee across street...
3                       apocalypse light spokane wildfire
4                      typhoon soudelor kill china taiwan
                              ...                        
3258    earthquake safety los angeles safety fastener ...
3259    storm ri bad last hurricane cityampothers hard...
3260      green line derailment chicago httptcoutbxlcbiuy
3261    meg issue hazardous weather outlook hwo httptc...
3262    cityofcalgary activate municipal emergency pla...
Name: text, Length: 3263, dtype: object


In [16]:
# use the tfidfvectorizer
#tf-idf= term frequency inverse document frequency

vectorizer= TfidfVectorizer()

# fit the training data
x_train_vector= vectorizer.fit_transform(x_train_lemma)

x_train_vector_df= pd.DataFrame(data= x_train_vector.toarray(), columns= vectorizer.get_feature_names_out())

# look at the first 10 rows
print(x_train_vector_df.head(10))

    aa  aaaa  aaarrrgghhhhttptcoguimbekge  aaceorg  aal  aampb  aan  \
0  0.0   0.0                          0.0      0.0  0.0    0.0  0.0   
1  0.0   0.0                          0.0      0.0  0.0    0.0  0.0   
2  0.0   0.0                          0.0      0.0  0.0    0.0  0.0   
3  0.0   0.0                          0.0      0.0  0.0    0.0  0.0   
4  0.0   0.0                          0.0      0.0  0.0    0.0  0.0   
5  0.0   0.0                          0.0      0.0  0.0    0.0  0.0   
6  0.0   0.0                          0.0      0.0  0.0    0.0  0.0   
7  0.0   0.0                          0.0      0.0  0.0    0.0  0.0   
8  0.0   0.0                          0.0      0.0  0.0    0.0  0.0   
9  0.0   0.0                          0.0      0.0  0.0    0.0  0.0   

   aaronthefm  aashiqui   ab  aba  abandon  abandonedpics  abbandoned  \
0         0.0       0.0  0.0  0.0      0.0            0.0         0.0   
1         0.0       0.0  0.0  0.0      0.0            0.0         0.0   

In [17]:
# use the vectorizer for validation and testing

x_val_vector= vectorizer.transform(x_val_lemma)

x_test_vector= vectorizer.transform(x_test_lemma)

In [18]:
# Try logistic regression for text classification

logreg= LogisticRegression(random_state= 24)

# fit the classifier
logreg.fit(x_train_vector, y_train)

# predict for training
y_pred_train_log= logreg.predict(x_train_vector)
# predict for validation
y_pred_val_log= logreg.predict(x_val_vector)

# evaluate
print('f1 score for log reg on training is:{}'.format(f1_score(y_train, y_pred_train_log)))
print('f1 score for log reg on validation is:{}'.format(f1_score(y_val, y_pred_val_log)))

f1 score for log reg on training is:0.8610500610500611
f1 score for log reg on validation is:0.7498660953401177


In [19]:
# Try bayesian model for text classification

bayes1= MultinomialNB()

# fit the classifier
bayes1.fit(x_train_vector, y_train)

# predict for training
y_pred_train_bayes= bayes1.predict(x_train_vector)
# predict for validation
y_pred_val_bayes= bayes1.predict(x_val_vector)

# evaluate things
print('f1 score for bayes on training is:{}'.format(f1_score(y_train, y_pred_train_bayes)))
print('f1 score for bayes on validation is:{}'.format(f1_score(y_val, y_pred_val_bayes)))


f1 score for bayes on training is:0.8807915057915058
f1 score for bayes on validation is:0.7421875


In [20]:
# try the random forest classifier
forest1= RandomForestClassifier(random_state= 24)

# fit the thing
forest1.fit(x_train_vector, y_train)

# predict on training
y_pred_train_forest= forest1.predict(x_train_vector)
# predict on validation
y_pred_val_forest= forest1.predict(x_val_vector)

# evaluate things
print('f1 score for random forest on training:{}'.format(f1_score(y_train, y_pred_train_forest)))
print('f1 score for random forest on validation:{}'.format(f1_score(y_val, y_pred_val_forest)))

f1 score for random forest on training:0.9953467759805009
f1 score for random forest on validation:0.7397408207343412


random forest model is overfitting

In [21]:
# make the submission file
# import the training data
sample_df= pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
print(sample_df.head(20))

# choose the best model (highest validation f1 score): logistic regression
y_pred_test= logreg.predict(x_test_vector)

# make the dataframe
y_test = pd.Series(data= y_pred_test, name= 'target')
y_final= pd.concat([id_test, y_test], axis= 1)

print(y_final.head(20))


# export things
y_final.to_csv('/kaggle/working/submission.csv', index= False)



    id  target
0    0       0
1    2       0
2    3       0
3    9       0
4   11       0
5   12       0
6   21       0
7   22       0
8   27       0
9   29       0
10  30       0
11  35       0
12  42       0
13  43       0
14  45       0
15  46       0
16  47       0
17  51       0
18  58       0
19  60       0
    id  target
0    0       1
1    2       1
2    3       1
3    9       1
4   11       1
5   12       1
6   21       0
7   22       0
8   27       0
9   29       0
10  30       0
11  35       0
12  42       0
13  43       0
14  45       0
15  46       1
16  47       0
17  51       1
18  58       0
19  60       0
