<a href="https://colab.research.google.com/github/maitysuvo19/internship_root2ai_classify_text/blob/main/Root2ai_classification_tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing necessary libraries

In [None]:
import os
import pandas as pd
import numpy as np

#import feature extraction methods from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words

#pre-processing of text
import string
import re

#import classifiers from sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

#import different metrics to evaluate the classifiers
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report 
from sklearn import metrics

#import time function from time module to track the training duration
from time import time



In [None]:
# Read the labelled corpus from the Colab working directory into a DataFrame.
DATA_PATH = '/content/root2ai - Data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Text,Target
0,reserve bank forming expert committee based in...,Blockchain
1,director could play role financial system,Blockchain
2,preliminary discuss secure transaction study r...,Blockchain
3,security indeed prove essential transforming f...,Blockchain
4,bank settlement normally take three days based...,Blockchain


In [None]:
# Share of rows per class: label counts divided by the total number of rows.
n_rows = len(df)
df["Target"].value_counts() / n_rows

FinTech             0.376630
Cyber Security      0.116279
Bigdata             0.099850
Reg Tech            0.097163
credit reporting    0.076991
Blockchain          0.060562
Neobanks            0.047084
Microservices       0.043032
Stock Trading       0.034663
Robo Advising       0.032461
Data Security       0.015284
Name: Target, dtype: float64

In [None]:
# Integer code for every class label; the codes follow the descending class
# frequency printed by value_counts() (most frequent class = 1).
TARGET_CODES = {
    'FinTech': 1,
    'Cyber Security': 2,
    'Bigdata': 3,
    'Reg Tech': 4,
    'credit reporting': 5,
    'Blockchain': 6,
    'Neobanks': 7,
    'Microservices': 8,
    'Stock Trading': 9,
    'Robo Advising': 10,
    'Data Security': 11,
}

# Series.map() returns NaN for every value that is not a key of the dict, so
# running this cell a second time would map the already-encoded integers to NaN
# and silently wipe the labels. Only encode while the column still holds the
# original (non-numeric) labels, which makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df["Target"]):
    df["Target"] = df["Target"].map(TARGET_CODES)

# Keep only the two columns used for modelling.
df = df[["Text", "Target"]]
df.head()

Unnamed: 0,Text,Target
0,reserve bank forming expert committee based in...,6
1,director could play role financial system,6
2,preliminary discuss secure transaction study r...,6
3,security indeed prove essential transforming f...,6
4,bank settlement normally take three days based...,6


# Text Pre-processing

Typical steps involve tokenization, lowercasing, removing stop words and punctuation markers, etc., and vectorization. Other processes such as stemming/lemmatization can also be performed. Here, we are performing the following steps: removing br tags, punctuation, numbers, and stopwords. While we are using sklearn's list of stopwords, there are several other stop word lists (e.g., from NLTK), and sometimes custom stopword lists are needed depending on the task.



In [None]:
# Take the English stop-word list from its public location. The
# `sklearn.feature_extraction.stop_words` module was deprecated in scikit-learn
# 0.22 and removed in 0.24, so `stop_words.ENGLISH_STOP_WORDS` is unavailable on
# current releases, whereas this import works on old and new versions alike.
# NOTE: the legacy `from sklearn.feature_extraction import stop_words` line in
# the imports cell still has to be removed for the notebook to import cleanly on
# scikit-learn >= 0.24.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stopwords = ENGLISH_STOP_WORDS
def clean(doc):
    """Normalise one raw document before it is tokenised by the vectorizer.

    Steps, in order:
      1. replace HTML line-break tags with a space,
      2. delete punctuation characters and digits,
      3. drop stop words (looked up in the module-level ``stopwords``).

    Parameters
    ----------
    doc : str
        Raw text of a single document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if nothing is left).
    """
    # The original only replaced the literal "</br>", which never matches the
    # "<br/>" / "<br />" / "<br>" spellings; those tags then lost their angle
    # brackets in the punctuation step and survived as a bogus "br" token
    # glued to neighbouring words. Match every spelling, case-insensitively.
    doc = re.sub(r"<\s*/?\s*br\s*/?\s*>", " ", doc, flags=re.IGNORECASE)

    # Remove punctuation and numbers (characters are deleted, not replaced).
    doc = "".join(ch for ch in doc if ch not in string.punctuation and not ch.isdigit())

    # The stop-word list is lower-case and a custom preprocessor replaces the
    # vectorizer's own lowercasing, so compare case-insensitively; otherwise
    # capitalised stop words such as "The" would be kept as features.
    return " ".join(tok for tok in doc.split() if tok.lower() not in stopwords)

# Modeling

In [None]:
#Step 1: train-test split
# Features: the text column; missing entries become a single blank so every row is a string.
X = df["Text"].fillna(' ')
# Labels: the column the classifiers learn to predict.
y = df["Target"]
print(X.shape, y.shape)

# Hold out a test set. With no explicit size, train_test_split keeps 75% of the
# rows for training and 25% for testing; random_state=1 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(split_X.shape, split_y.shape)

(22704,) (22704,)
(17028,) (17028,)
(5676,) (5676,)


In [None]:
#Step 2-3: Preprocess and Vectorize train and test data
# TF-IDF vectorizer that runs `clean` as its preprocessing step.
vect = TfidfVectorizer(preprocessor=clean)
# Fit on the training texts only, then reuse the fitted vectorizer for the test
# texts so that both matrices share the same feature columns.
X_train_dtm, X_test_dtm = vect.fit_transform(X_train), vect.transform(X_test)
# Each shape is (number of documents, number of features).
print(X_train_dtm.shape, X_test_dtm.shape)

(17028, 11329) (5676, 11329)


Naive Bayes Classifier

In [None]:
#Step 3: Train the classifier and predict for test data
nb = MultinomialNB() #instantiate a Multinomial Naive Bayes model
%time nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)#make class predictions for X_test_dtm
#calculate evaluation measures:
print("Accuracy: ", accuracy_score(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))

CPU times: user 13.3 ms, sys: 860 µs, total: 14.1 ms
Wall time: 17.4 ms
Accuracy:  0.5266032417195208
              precision    recall  f1-score   support

           1       0.45      0.99      0.62      2168
           2       0.78      0.23      0.36       655
           3       0.94      0.46      0.62       577
           4       0.95      0.63      0.75       532
           5       0.86      0.09      0.16       424
           6       0.85      0.05      0.09       355
           7       1.00      0.01      0.01       272
           8       1.00      0.06      0.11       225
           9       1.00      0.06      0.11       188
          10       0.00      0.00      0.00       185
          11       1.00      0.03      0.06        95

    accuracy                           0.53      5676
   macro avg       0.80      0.24      0.26      5676
weighted avg       0.70      0.53      0.44      5676



  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression Classifier

In [None]:
# Logistic regression with class weights balanced against the skewed label
# distribution. The default max_iter=100 is too small for this problem: the
# saved run ends with "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT", meaning the
# reported scores came from a solver that had not converged. Allow more iterations.
logreg = LogisticRegression(class_weight="balanced", max_iter=1000)
logreg.fit(X_train_dtm, y_train) #fit the model with training data

#Make predictions on test data
y_pred_class = logreg.predict(X_test_dtm)

#calculate evaluation measures:
print("Accuracy: ", accuracy_score(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))

Accuracy:  0.5910852713178295
              precision    recall  f1-score   support

           1       0.80      0.44      0.57      2168
           2       0.54      0.57      0.56       655
           3       0.67      0.84      0.74       577
           4       0.85      0.79      0.82       532
           5       0.61      0.66      0.64       424
           6       0.46      0.66      0.54       355
           7       0.30      0.57      0.39       272
           8       0.52      0.70      0.60       225
           9       0.44      0.70      0.54       188
          10       0.35      0.61      0.45       185
          11       0.28      0.43      0.34        95

    accuracy                           0.59      5676
   macro avg       0.53      0.63      0.56      5676
weighted avg       0.65      0.59      0.60      5676



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Support Vector Machine

In [None]:
# Linear support vector classifier with balanced class weights; fit() returns
# the estimator itself, so construction and training can be chained.
svm = LinearSVC(class_weight='balanced').fit(X_train_dtm, y_train)

# Class predictions for the held-out test matrix.
y_pred_class = svm.predict(X_test_dtm)

# Evaluation: overall accuracy first, then the per-class report.
svm_accuracy = accuracy_score(y_test, y_pred_class)
print("Accuracy: ", svm_accuracy)
print(classification_report(y_test, y_pred_class))

Accuracy:  0.6460535588442565
              precision    recall  f1-score   support

           1       0.76      0.60      0.67      2168
           2       0.57      0.61      0.59       655
           3       0.75      0.84      0.79       577
           4       0.88      0.81      0.84       532
           5       0.58      0.67      0.62       424
           6       0.55      0.61      0.58       355
           7       0.38      0.49      0.43       272
           8       0.60      0.68      0.64       225
           9       0.49      0.69      0.57       188
          10       0.38      0.55      0.45       185
          11       0.32      0.40      0.36        95

    accuracy                           0.65      5676
   macro avg       0.57      0.63      0.59      5676
weighted avg       0.67      0.65      0.65      5676



Our large feature vector could be creating a lot of noise in the form of very rarely occurring features that are not useful for learning. Let us change the TF-IDF vectorizer to keep at most a fixed number of features.

In [None]:
#Step 2-3: Preprocess and Vectorize train and test data
# Same TF-IDF setup as before, but the vocabulary is capped at 5000 features.
vect = TfidfVectorizer(preprocessor=clean, max_features=5000)
# Fit on the training texts only, then reuse the fitted vectorizer for the test
# texts so that both matrices share the same feature columns.
X_train_dtm, X_test_dtm = vect.fit_transform(X_train), vect.transform(X_test)
# Each shape is (number of documents, number of features).
print(X_train_dtm.shape, X_test_dtm.shape)

(17028, 5000) (5676, 5000)


Naive Bayes Classifier

In [None]:
#Step 3: Train the classifier and predict for test data
nb = MultinomialNB() #instantiate a Multinomial Naive Bayes model
%time nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)#make class predictions for X_test_dtm
#calculate evaluation measures:
print("Accuracy: ", accuracy_score(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))

CPU times: user 11.5 ms, sys: 1.3 ms, total: 12.8 ms
Wall time: 13.8 ms
Accuracy:  0.5805144467935166
              precision    recall  f1-score   support

           1       0.49      0.98      0.66      2168
           2       0.70      0.36      0.48       655
           3       0.89      0.54      0.67       577
           4       0.92      0.71      0.80       532
           5       0.86      0.23      0.37       424
           6       0.89      0.13      0.23       355
           7       1.00      0.02      0.04       272
           8       1.00      0.17      0.30       225
           9       1.00      0.23      0.38       188
          10       1.00      0.01      0.02       185
          11       1.00      0.04      0.08        95

    accuracy                           0.58      5676
   macro avg       0.89      0.31      0.37      5676
weighted avg       0.74      0.58      0.52      5676



Logistic Regression Classifier



In [None]:
# Logistic regression with class weights balanced against the skewed label
# distribution. The default max_iter=100 is too small for this problem: the
# saved run ends with "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT", meaning the
# reported scores came from a solver that had not converged. Allow more iterations.
logreg = LogisticRegression(class_weight="balanced", max_iter=1000)
logreg.fit(X_train_dtm, y_train) #fit the model with training data

#Make predictions on test data
y_pred_class = logreg.predict(X_test_dtm)

#calculate evaluation measures:
print("Accuracy: ", accuracy_score(y_test, y_pred_class))
print(classification_report(y_test, y_pred_class))

Accuracy:  0.5821000704721635
              precision    recall  f1-score   support

           1       0.80      0.44      0.56      2168
           2       0.54      0.56      0.55       655
           3       0.65      0.84      0.73       577
           4       0.84      0.75      0.79       532
           5       0.62      0.66      0.64       424
           6       0.45      0.65      0.53       355
           7       0.30      0.58      0.39       272
           8       0.49      0.70      0.58       225
           9       0.42      0.69      0.53       188
          10       0.35      0.61      0.44       185
          11       0.25      0.43      0.32        95

    accuracy                           0.58      5676
   macro avg       0.52      0.63      0.55      5676
weighted avg       0.65      0.58      0.59      5676



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Support Vector Machine

In [None]:
# Linear support vector classifier with balanced class weights; fit() returns
# the estimator itself, so construction and training can be chained.
svm = LinearSVC(class_weight='balanced').fit(X_train_dtm, y_train)

# Class predictions for the held-out test matrix.
y_pred_class = svm.predict(X_test_dtm)

# Evaluation: overall accuracy first, then the per-class report.
svm_accuracy = accuracy_score(y_test, y_pred_class)
print("Accuracy: ", svm_accuracy)
print(classification_report(y_test, y_pred_class))

Accuracy:  0.6280831571529246
              precision    recall  f1-score   support

           1       0.77      0.58      0.66      2168
           2       0.57      0.60      0.58       655
           3       0.71      0.82      0.76       577
           4       0.86      0.78      0.82       532
           5       0.57      0.66      0.61       424
           6       0.51      0.60      0.55       355
           7       0.36      0.48      0.41       272
           8       0.59      0.67      0.63       225
           9       0.46      0.68      0.55       188
          10       0.36      0.54      0.44       185
          11       0.29      0.38      0.33        95

    accuracy                           0.63      5676
   macro avg       0.55      0.62      0.57      5676
weighted avg       0.66      0.63      0.63      5676

