In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries

In [None]:
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer,WordNetLemmatizer
import tensorflow as tf
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dropout,Dense,BatchNormalization
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Read Datasets

In [None]:
# Load the training split (it carries the 'target' label column used below).
train_df = pd.read_csv('./data/nlp-getting-started/train.csv')

In [None]:
# Load the test split; it is scored later to build the submission file.
test_df = pd.read_csv('./data/nlp-getting-started/test.csv')

In [None]:
# Preview the raw training frame.
train_df

In [None]:
# Preview the raw test frame.
test_df

In [None]:
# (rows, columns) of the training and test frames.
train_df.shape, test_df.shape

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# Distinct values of the 'keyword' column, inspected before deciding whether to keep it.
train_df['keyword'].unique()

In [None]:
# First 50 rows of keyword, text and target side by side.
train_df[['keyword','text','target']].head(50)

## Drop Unnecessary Columns

In [None]:
# Drop the columns that are not used for modelling.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second run no longer raises KeyError on the missing columns.
train_df = train_df.drop(columns=['keyword','location'], errors='ignore')

In [None]:
# Drop the same unused columns from the test frame.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second run no longer raises KeyError on the missing columns.
test_df = test_df.drop(columns=['keyword','location'], errors='ignore')

In [None]:
# Check which columns remain in the training frame.
train_df

In [None]:
# Number of training rows per target value (class balance).
val_count = train_df['target'].value_counts()
val_count

In [None]:
# Bar chart of how many training rows fall under each target value.
fig, ax = plt.subplots()
val_count.plot(kind='bar', ax=ax)
ax.set_xlabel('Target Values')
ax.set_ylabel('Count')
ax.set_title('Bar Chart of Value Counts in Target Column')
plt.show()

## Data Cleaning

In [None]:
# Fetch the NLTK 'stopwords' corpus that stopwords.words() reads from.
nltk.download('stopwords')

In [None]:
# English stop words held in a set for fast membership tests inside clean_text.
stop_words = set(stopwords.words('english'))

In [None]:
def clean_text(text, stopword_set=None):
    """Normalise raw text into a lowercase, space-separated string of content words.

    Steps, in order: remove URLs, remove @mentions and '#' signs, remove
    digits, lowercase, strip ASCII punctuation, drop stop words.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None or a float NaN from a
        missing cell) yields an empty string instead of raising.
    stopword_set : collection of str, optional
        Lowercase stop words to drop. Defaults to the notebook-level
        ``stop_words`` set.

    Returns
    -------
    str
        The cleaned text with single spaces between the remaining words.
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove @mentions and the '#' sign (the hashtag word itself is kept)
    text = re.sub(r'\@\w+|\#', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Convert to lowercase
    text = text.lower()

    strip_punct = str.maketrans('', '', string.punctuation)
    kept = []
    for token in text.split():
        bare = token.translate(strip_punct)
        if not bare:
            # The token consisted only of punctuation.
            continue
        # Stop-word lists contain contractions such as "don't". Checking only
        # the punctuation-free form ("dont") would never match them, so the
        # form with just the surrounding punctuation trimmed is checked too.
        if bare in stopword_set or token.strip(string.punctuation) in stopword_set:
            continue
        kept.append(bare)

    return ' '.join(kept)

In [None]:
# Clean every row's text in both splits; the 'text' column is overwritten with the cleaned version.
train_df['text'] = train_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(clean_text)

In [None]:
# Inspect the cleaned training text.
train_df['text']

In [None]:
# Training frame after cleaning.
train_df

In [None]:
# English Snowball stemmer instance used by stem_text.
stemmer = SnowballStemmer('english')

In [None]:
def stem_text(text):
    """Return ``text`` with every whitespace-separated word replaced by its stem.

    Words are stemmed with the notebook-level ``stemmer`` and re-joined with
    single spaces.
    """
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

In [None]:
# Stem the cleaned training text (overwrites the 'text' column).
train_df['text'] = train_df['text'].apply(stem_text)

In [None]:
# Stem the cleaned test text the same way as the training text.
test_df['text'] = test_df['text'].apply(stem_text)

In [None]:
# Training frame after stemming.
train_df

In [None]:
# Test frame after stemming.
test_df

## Deep Learning

In [None]:
# Build the word -> integer index from the training text only.
word_tokenizer = tf.keras.preprocessing.text.Tokenizer()
word_tokenizer.fit_on_texts(train_df['text'])
# +1 on top of the number of indexed words; presumably to leave room for the
# padding index 0, which the Keras tokenizer does not assign to any word.
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

In [None]:
# Longest training sentence, measured in whitespace-separated words.
# The text is already cleaned and stemmed, so plain splitting is enough and
# lines up with the space-based splitting the Keras tokenizer uses by default.
# It also avoids nltk.word_tokenize, which needs the 'punkt' data that this
# notebook never downloads.
longest_train = max(train_df['text'], key=lambda sentence: len(sentence.split()))
length_long_sentence = len(longest_train.split())

In [None]:
# Sizes derived from the fitted tokenizer and the longest training sentence.
max_words = vocab_length
max_len = length_long_sentence

# Encode the training text as integer sequences and pad them to max_len.
sequences = word_tokenizer.texts_to_sequences(train_df['text'])
padded_sequences = pad_sequences(sequences, maxlen=max_len)
# Turn the integer target column into a categorical (one column per class) matrix.
labels = to_categorical(train_df['target'])

# Embedding -> dropout -> LSTM -> two sigmoid output units.
model = Sequential([
    Embedding(max_words, 128),
    Dropout(0.5),
    LSTM(64),
    Dense(2, activation='sigmoid'),
])
model.compile(loss='BinaryCrossentropy', optimizer='Adamax', metrics=['accuracy'])

# Train on the full training set (no validation split).
model.fit(padded_sequences, labels, epochs=10, batch_size=64)

In [None]:
# Encode the test text with the tokenizer fitted on the training data and
# pad to the same max_len the model was trained with.
sequences2 = word_tokenizer.texts_to_sequences(test_df['text'])
padded_sequences2 = pad_sequences(sequences2, maxlen=max_len)

In [None]:
# Model outputs for the test set; presumably one row per test sample with two
# columns, given the Dense(2) output layer.
predictions = model.predict(padded_sequences2)

In [None]:
# Score of the second output unit (class 1) for every test row.
prob_class_1 = predictions[:, 1]
# Threshold at 0.5 in one vectorised step, then convert back to a plain list of ints.
binary_predictions = (prob_class_1 >= 0.5).astype(int).tolist()
# Index the predictions by the test ids and label the index column 'id'.
df = pd.DataFrame({'target': binary_predictions}, index=test_df['id']).rename_axis('id')
df.to_csv('/kaggle/working/submission.csv')

In [None]:
# Distribution of the labels predicted for the test set.
# `df` holds the submission predictions, not the training labels, so the
# title now says so.
plt.figure(figsize=(6, 4))
sns.countplot(x='target', data=df)
plt.title('Distribution of Predicted Labels in Test Set')
plt.xlabel('Label')
plt.ylabel('Count')
plt.show()

In [None]:
# Read the submission file back to check what was written.
pd.read_csv('/kaggle/working/submission.csv')

## Machine Learning

In [None]:
# Features (the preprocessed text) and labels for the classical models.
X = train_df['text']
y = train_df['target']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# TF-IDF vectoriser capped at 5000 features, using scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(max_features=5000,stop_words='english')

In [None]:
# Fit the vectoriser on the training split only and transform it.
# NOTE: X_train is overwritten with the result, so re-running this cell
# requires re-running the train/test split cell first.
X_train = tfidf.fit_transform(X_train)

In [None]:
# Transform the held-out split with the already-fitted vectoriser (no refit).
# NOTE: X_test is overwritten with the result, so this cell is not re-runnable on its own.
X_test = tfidf.transform(X_test)

## LogisticRegression

In [None]:
# Logistic regression with the iteration cap set to 1000.
lr = LogisticRegression(max_iter=1000)

In [None]:
# Train on the TF-IDF features of the training split.
lr.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out split.
pred_y_lr = lr.predict(X_test)

In [None]:
# Hold-out accuracy of the logistic regression model.
accuracy_score(y_test,pred_y_lr)

## RandomForest

In [None]:
# Random forest with 71 trees. The seed is fixed (same value as the
# train/test split) so the reported accuracy is reproducible across runs.
rf = RandomForestClassifier(n_estimators=71, random_state=42)

In [None]:
# Train on the TF-IDF features of the training split.
rf.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out split.
y_pred_rf = rf.predict(X_test)

In [None]:
# Hold-out accuracy of the random forest model.
accuracy_score(y_test,y_pred_rf)