In [None]:
!pip install --user -U nltk
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
# Disaster Prediction
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tag import DefaultTagger
from nltk.corpus import stopwords
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the data 
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

# Length of the dataframe
print(len(train_df))
print(len(test_df))

7613
3263


In [3]:
# Visualize the data 
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [11]:
train_df['target'].unique()

array([1, 0])

In [4]:
# Visualize the data
train_df.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [5]:
# drop the irrelevant columns 
train_df = train_df.drop(['id', 'keyword', 'location'], axis=1)
train_df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# Drop the irrelevant columns for the test file
test_df = test_df.drop(['id', 'keyword', 'location'], axis =1)
test_df.head()

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
# Basic Preprocessing routine
def preprocessing(train_text:str)->str:
  """Function for processing the text in the data"""
  # Convert the text into tokens
  tokenized_words = nltk.word_tokenize(train_text)
  lemmatizer = WordNetLemmatizer()
  # Defining Tag
  tagging = DefaultTagger('NN')
  ps = PorterStemmer()

  for index in range(len(tokenized_words)):
    tokenized_words[index] = lemmatizer.lemmatize(tokenized_words[index])
    tokenized_words[index] = ps.stem(tokenized_words[index])
  
  tokens_with_tags = tagging.tag(tokenized_words)
  # Remove stopwords
  stop_words = set(stopwords.words('english'))
  # Remove the punctuation
  punctuations = list(punctuation)
  filtered_sentence = [w for w in tokenized_words if not w.lower() in stop_words and w not in punctuation]
  processed_text = " ".join(filtered_sentence)
  return processed_text
  

# Separate the train data and labels into separate variables
train_text, train_labels = train_df['text'], train_df['target']
# Process the train text
for index in range(len(train_text)):
  train_text.iloc[index] = preprocessing(train_text.iloc[index])

test_text = test_df['text']
# Process the test text
for index in range(len(test_text)):
  test_text.iloc[index] = preprocessing(test_text.iloc[index])

test_text.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_text.iloc[index] = preprocessing(train_text.iloc[index])


0                             happen terribl car crash
1        heard earthquak differ citi stay safe everyon
2    forest fire spot pond goos flee across street ...
3                       apocalyps light spokan wildfir
4                typhoon soudelor kill 28 china taiwan
Name: text, dtype: object

In [9]:
# Processing text to vector 

# Bag of words (BOW)
cv = CountVectorizer(max_features = 50)  ##give it a max features as 3
Bagofwords = cv.fit_transform(train_text).toarray()
print(Bagofwords.shape)


(7613, 50)


In [10]:
# TF-idf 
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(train_text)
# print(tfidf_model) 
# print the full sparse matrix
print(tfidf_model.toarray())
print(tfidf_model.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(7613, 18814)


In [13]:
# Classification with Logistic Regression
clf = LogisticRegression(random_state=0).fit(Bagofwords, train_labels)

In [14]:
# Predict
Bagofwords_test = cv.fit_transform(test_text).toarray()
clf.predict(Bagofwords_test)

array([0, 0, 0, ..., 0, 0, 1])

In [15]:
# Have to split the data into train, validation, test