<a href="https://colab.research.google.com/github/mahekbagde/ML/blob/main/textclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

CTA-14 Mahek Bagde


Text classification using Word2Vec in Python

In [None]:
!pip install scikit-learn gensim nltk
#gensim for word embeddings and topic modeling, and nltk (the Natural Language Toolkit for text processing and analysis.



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the movie-review CSV into a DataFrame.
data = pd.read_csv('/content/sample_data/movie_review.csv')

# Features come from the 'text' column and labels from the 'tag' column.
# test_size=0.2 holds out 20% of the rows for testing; the remaining 80% are used for training.
# random_state=42 fixes the shuffle, so re-running the cell yields the same split.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['tag'],
    test_size=0.2,
    random_state=42,
)


In [None]:
import nltk
# Stopwords are common words (like "the", "is", "and", etc.) that are often removed from
# text data because they don't usually convey significant meaning.
nltk.download('stopwords')
# word_tokenize() needs the Punkt tokenizer data; without it a fresh kernel raises LookupError.
# Older NLTK releases look for 'punkt', newer ones for 'punkt_tab', so request both.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# The string module is used here only for its set of punctuation characters.
import string



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# English stopwords from the NLTK corpus, kept in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

def preprocess(text):
  """Normalize one review: lowercase, strip punctuation, drop stopwords.

  Returns the surviving tokens joined by single spaces.
  """
  # Lowercase, then delete every character that appears in string.punctuation.
  cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))
  # Tokenize with word_tokenize() and keep only tokens that are not stopwords.
  kept = (token for token in word_tokenize(cleaned) if token not in stop_words)
  return ' '.join(kept)

# Overwrite both splits with their cleaned text.
X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)


In [None]:
# Display the training text after preprocessing (pandas truncates long Series output).
X_train

48063    patch never seemed understand possibility mayb...
7748     film stars matt damon hunting mathematical reb...
39824                      square one bizarre case casting
49957                             become walking wardrobes
33990    hong kong film tempo changes heightening emoti...
                               ...                        
62570    laughable moments include luc going strip club...
38158    even aspect film fails throwing convenient rid...
860      schreber contacted john reasons continually re...
15795    sure excellent bound usual suspects spring imm...
56422    dismal thirdrate farrelly brothers rip attempt...
Name: text, Length: 51776, dtype: object

In [None]:
# Display the test text after preprocessing (pandas truncates long Series output).
X_test

58154                            like dream without appeal
33401    stateoftheart special effects never carpenter ...
44182    action films action sequences conventional att...
46480    number reasons including fact experienced film...
41584    julie james jennifer love hewitt ray bronson f...
                               ...                        
4456                                 exactly fifth element
27023    motion picture adapted elmore leonards novel r...
17103    scene ends tragedy clear theres kind powerful ...
63177         much horror everyone standing within earshot
49820    bills youngest daughter susan claire forlani f...
Name: text, Length: 12944, dtype: object

Train the Word2Vec model

Train a Word2Vec model on the preprocessed training data using the Gensim package.

In [None]:
# The Word2Vec class in Gensim trains word embeddings with the Word2Vec algorithm.
from gensim.models import Word2Vec

# Word2Vec expects an iterable of token lists, so split each training text on whitespace.
sentences = [sentence.split() for sentence in X_train]
# window=5    -> maximum distance between the current and the predicted word
# min_count=5 -> ignore words that occur fewer than 5 times in the corpus
# workers=4   -> number of worker threads used for training
# seed=42     -> fixes the random initialisation so embeddings are comparable between runs
#                (NOTE: per the gensim docs, a bit-for-bit reproducible run also needs
#                workers=1 and a fixed PYTHONHASHSEED).
w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4, seed=42)

Vectorize the text data

In [None]:
import numpy as np

def vectorize(sentence):
    """Return the average Word2Vec embedding of the words in ``sentence``.

    The sentence is split on whitespace, and words missing from the trained
    model's vocabulary are skipped. If no word is in the vocabulary, a zero
    vector is returned. Its length is read from the model rather than
    hard-coded, so it always matches the embeddings.
    """
    wv = w2v_model.wv
    # Keep only the vectors of words the model actually knows.
    word_vecs = [wv[word] for word in sentence.split() if word in wv]
    if len(word_vecs) == 0:
        # Match the model's embedding dimensionality instead of assuming 100.
        return np.zeros(wv.vector_size)
    # Stack the word vectors and average them element-wise.
    return np.array(word_vecs).mean(axis=0)

# Replace each split's text with an array holding one averaged embedding per sentence.
X_train = np.array(list(map(vectorize, X_train)))
X_test = np.array(list(map(vectorize, X_test)))

Train a Classification model

In [None]:
from sklearn.linear_model import LogisticRegression

# With the default iteration cap (max_iter=100) the solver stops before converging and
# emits "TOTAL NO. of ITERATIONS REACHED LIMIT"; give it more iterations to finish.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluate the model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict a label for every test vector; the metric cells below compare y_pred with y_test.
y_pred = clf.predict(X_test)

In [None]:
import sklearn.metrics as metrics
# Metrics with 'pos' treated as the positive class.
Accuracy = metrics.accuracy_score(y_test, y_pred)
Precision = metrics.precision_score(y_test, y_pred, pos_label='pos')
# Sensitivity (recall) = true-positive rate = recall of the positive class.
Sensitivity_recall = metrics.recall_score(y_test, y_pred, pos_label='pos')
# Specificity = true-negative rate = recall of the *negative* class. Reusing pos_label='pos'
# here would just repeat the sensitivity value.
Specificity = metrics.recall_score(y_test, y_pred, pos_label='neg')
F1_score = metrics.f1_score(y_test, y_pred, pos_label='pos')
print({"Accuracy":Accuracy,"Precision":Precision,"Sensitivity_recall":Sensitivity_recall,"Specificity":Specificity,"F1_score":F1_score})

{'Accuracy': 0.5197775030902348, 'Precision': 0.5159560203807991, 'Sensitivity_recall': 0.8781378366042902, 'Specificity': 0.8781378366042902, 'F1_score': 0.65}


In [None]:
# Changing pos_label from 'pos' to 'neg' swaps which class counts as "positive" when
# computing precision, recall and F1. This is useful when you want to focus on the
# classifier's performance for one specific class.

import sklearn.metrics as metrics
Accuracy = metrics.accuracy_score(y_test, y_pred)
Precision = metrics.precision_score(y_test, y_pred, pos_label='neg')
# Sensitivity (recall) is the recall of the class chosen as positive, here 'neg'.
Sensitivity_recall = metrics.recall_score(y_test, y_pred, pos_label='neg')
# Specificity is the recall of the *other* class ('pos' here). Reusing pos_label='neg'
# would just repeat the sensitivity value.
Specificity = metrics.recall_score(y_test, y_pred, pos_label='pos')
F1_score = metrics.f1_score(y_test, y_pred, pos_label='neg')
print({"Accuracy":Accuracy,"Precision":Precision,"Sensitivity_recall":Sensitivity_recall,"Specificity":Specificity,"F1_score":F1_score})

{'Accuracy': 0.5197775030902348, 'Precision': 0.5441092771770063, 'Sensitivity_recall': 0.1500549364307016, 'Specificity': 0.1500549364307016, 'F1_score': 0.23523622047244094}
