In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import pandas as pd

# Load the datasets
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
submission_df = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

# Quick look at the data
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
train_df.head()


Train shape: (7613, 5)
Test shape: (3263, 4)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
train_df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [5]:
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+","",text) # Remove URLs
    text = re.sub(r"@\w+","",text) # Remove mentions
    text = re.sub(r"#","",text) # Remove Hastag symbol
    text = re.sub(r"[^\w\s]","",text) # Remove punctuation
    text = re.sub(r"\d+","",text) # Remove Numbers
    return text

train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words= 'english', max_features=5000)
x_train = vectorizer.fit_transform(train_df['clean_text'])
x_test = vectorizer.transform(test_df['clean_text'])

In [10]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, train_df['target'])

In [11]:
preds = model.predict(x_test)
submission_df['target'] = preds
submission_df.to_csv('submission.csv', index=False)

In [12]:
from sklearn.model_selection import cross_val_score
cross_val_score(model, x_train, train_df['target'],cv=5,scoring='accuracy').mean()

0.6956599767213717

In [13]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

train_df['clean_text'] = train_df['clean_text'].apply(lemmatize_text)
test_df['clean_text'] = test_df['clean_text'].apply(lemmatize_text)


In [14]:
train_df['keyword'] = train_df['keyword'].fillna('')
test_df['keyword'] = test_df['keyword'].fillna('')
train_df['location'] = train_df['location'].fillna('')
test_df['location'] = test_df['location'].fillna('')


train_df['combined'] = train_df['clean_text'] + " " + train_df['keyword'] + " " + train_df['location']
test_df['combined'] = test_df['clean_text'] + " " + test_df['keyword'] + " " + test_df['location']


X_train = vectorizer.fit_transform(train_df['combined'])
X_test = vectorizer.transform(test_df['combined'])


In [17]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, train_df['target'])
rf_preds = rf_model.predict(X_test)
submission_df['target'] = rf_preds
submission_df.to_csv('submission_rf.csv', index=False)


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.toarray())  
X_test_scaled = scaler.transform(X_test.toarray())

model = LogisticRegression(max_iter=500, solver='lbfgs')  
model.fit(X_train_scaled, train_df['target'])
predictions = model.predict(X_test_scaled)
submission_df['target'] = predictions
submission_df.to_csv('submission_logistic_regression.csv', index=False)


In [22]:
from sklearn.metrics import accuracy_score, classification_report


train_predictions = model.predict(X_train_scaled)
print("Training Accuracy: ", accuracy_score(train_df['target'], train_predictions))
print("Classification Report:\n", classification_report(train_df['target'], train_predictions))


if 'target' in submission_df.columns:
    test_predictions = model.predict(X_test_scaled)
    print("Test Accuracy: ", accuracy_score(submission_df['target'], test_predictions))
    print("Test Classification Report:\n", classification_report(submission_df['target'], test_predictions))


Training Accuracy:  0.9888348876921056
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4342
           1       0.99      0.98      0.99      3271

    accuracy                           0.99      7613
   macro avg       0.99      0.99      0.99      7613
weighted avg       0.99      0.99      0.99      7613

Test Accuracy:  1.0
Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1810
           1       1.00      1.00      1.00      1453

    accuracy                           1.00      3263
   macro avg       1.00      1.00      1.00      3263
weighted avg       1.00      1.00      1.00      3263

