In [2]:
#importing list type from typing library
from typing import List
from textwrap import wrap


In [3]:
#importing numpy and pandas library for data preparation
import numpy as np
import pandas as pd

In [4]:
#importing visualization library for this application
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
#taking metrics from sklearn library
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [6]:
#setting up bacground 
sns.set(style = 'darkgrid',
        palette = 'pastel',
        font_scale = 2,
        rc = {
            'figure.figsize': (14,10),
            'axes.labelsize':16
        })

here we are making a confusion_matrix to evaluate the models and their accuracy 

In [7]:
def confusion_matrix(ytrue: np.ndarray, yhat: np.ndarray, figsize = (15, 10)):
    # Getting labels
    labels = sorted(list(set(ytrue)))
    
    # Creating confusion matrix
    osmodel = pd.DataFrame(confusion_matrix(ytrue, y_hat), index = labels, columns = labels)
    
    # Accuracy and F1-score
    acc = accuracy_score(y_true, y_hat)
    f1s = f1_score(y_true, y_hat, average = 'weighted')
    
    # Ploting Confusin matrix
    fig, ax = plt.subplots(figsize = figsize)
    sns.heatmap(osmodel, annot = True, square = True, fmt = 'd', linewidths = 1,cbar = False, cmap = 'Paired', ax = ax)
    ax.set(title = f'Accuracy: {acc:.2f}, F1 score: {f1s:.2f}', xlabel = 'Predicted', ylabel = 'Actual')
    fig.suptitle('Confusion Matrix')
    plt.tight_layout()


here we are taking out the top features that are having more accuracy and for this i am using get_top_features fuction

In [8]:
def get_top_features(vectoriser, clf, selector = None, top_n: int = 25, how: str = 'long'):
    """
    Convenience function to extract top_n predictor per class from a model.
    """

    assert hasattr(vectoriser, 'get_feature_names')
    assert hasattr(clf, 'coef_')
    assert hasattr(selector, 'get_support')
    assert how in {'long', 'wide'}, f'how must be either long or wide not {how}'

    features = vectoriser.get_feature_names_out()
    if selector is not None:
        features = features[selector.get_support()]
    axis_names = [f'freature_{x + 1}' for x in range(top_n)]

    if len(clf.classes_) > 2:
        results = list()
        for c, coefs in zip(clf.classes_, clf.coef_):
            idx = coefs.argsort()[::-1][:top_n]
            results.extend(tuple(zip([c] * top_n, features[idx], coefs[idx])))
    else:
        coefs = clf.coef_.flatten()
        idx = coefs.argsort()[::-1][:top_n]
        results = tuple(zip([clf.classes_[1]] * top_n, features[idx], coefs[idx]))

    dflambda = pd.DataFrame(results, columns =  ['sdg', 'feature', 'coef'])

    if how == 'wide':
        dflambda = pd.DataFrame(
            np.array_split(dflambda['feature'].values, len(dflambda) / top_n),
            index = clf.classes_ if len(clf.classes_) > 2 else [clf.classes_[1]],
            columns = axis_names
        )

    return dflambda


In [9]:
#defining the 1sustainable development goals and here we are taking 15 goals
def fixsdgname(sdg: str, width: int = 30) -> str:
    sdg_id2name = {
        1: 'GOAL 1: No Poverty',
        2: 'GOAL 2: Zero Hunger',
        3: 'GOAL 3: Good Health and Well-being',
        4: 'GOAL 4: Quality Education',
        5: 'GOAL 5: Gender Equality',
        6: 'GOAL 6: Clean Water and Sanitation',
        7: 'GOAL 7: Affordable and Clean Energy',
        8: 'GOAL 8: Decent Work and Economic Growth',
        9: 'GOAL 9: Industry, Innovation and Infrastructure',
        10: 'GOAL 10: Reduced Inequality',
        11: 'GOAL 11: Sustainable Cities and Communities',
        12: 'GOAL 12: Responsible Consumption and Production',
        13: 'GOAL 13: Climate Action',
        14: 'GOAL 14: Life Below Water',
        15: 'GOAL 15: Life on Land',
        16: 'GOAL 16: Peace and Justice Strong Institutions',
        17: 'GOAL 17: Partnerships to achieve the Goal'
    }
    name = sdg_id2name[int(sdg)]
    return '<br>'.join(wrap(name, 30))

In [11]:
#importing all the libraries that are requitred for this dataset and aevaluation
from typing import List
import numpy as np
import pandas as pd
import plotly.express as px
import spacy
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, top_k_accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
import re
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
sns.set_theme()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
#downloading the dataset and imprting into this notebook this is the community dataset developed by so many researchers and it is aopensouirce community dataset and we will make a pretrained model on this
from tqdm import tqdm
osmodel = pd.read_csv('https://zenodo.org/record/5550238/files/osdg-community-dataset-v21-09-30.csv?download=1') 
osmodel.head()

Unnamed: 0,doi,text_id,text,sdg,labels_negative,labels_positive,agreement
0,10.6027/9789289342698-7-en,00021941702cd84171ff33962197ca1f,"From a gender perspective, Paulgaard points ou...",5,1,7,0.75
1,10.18356/eca72908-en,00028349a7f9b2485ff344ae44ccfd6b,Labour legislation regulates maximum working h...,11,2,1,0.333333
2,10.1787/9789264289062-4-en,0004eb64f96e1620cd852603d9cbe4d4,The average figure also masks large difference...,3,1,6,0.714286
3,10.1787/5k9b7bn5qzvd-en,0006a887475ccfa5a7f5f51d4ac83d02,The extent to which they are akin to corruptio...,3,1,2,0.333333
4,10.1787/9789264258211-6-en,0006d6e7593776abbdf4a6f985ea6d95,A region reporting a higher rate will not earn...,3,2,2,0.0


so here we can see that so many labels are there and negative labels ratio is higher than positive labels ratio and so we are going to take agreement value and train on these features

In [14]:
#printing the info about the model 
print(osmodel.info())
print(osmodel.isna())
print(osmodel.shape)
osmodel.head()
sorted(osmodel["sdg"].unique())
textsentence = osmodel.text.values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32121 entries, 0 to 32120
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   doi              32121 non-null  object 
 1   text_id          32121 non-null  object 
 2   text             32121 non-null  object 
 3   sdg              32121 non-null  int64  
 4   labels_negative  32121 non-null  int64  
 5   labels_positive  32121 non-null  int64  
 6   agreement        32121 non-null  float64
dtypes: float64(1), int64(3), object(3)
memory usage: 1.7+ MB
None
         doi  text_id   text  ...  labels_negative  labels_positive  agreement
0      False    False  False  ...            False            False      False
1      False    False  False  ...            False            False      False
2      False    False  False  ...            False            False      False
3      False    False  False  ...            False            False      False
4      False    

In [18]:
#finding out the cumulative sum and agreeement
osdgcum = osmodel['agreement'].value_counts(normalize=True).sort_index().cumsum().to_frame(name='probabilitysum')
osdgcum.reset_index(inplace=True)
osdgcum.rename({'index':'agreement'},axis = 1, inplace=True)
osdgcum.shape
osdgcum.head(2)

Unnamed: 0,agreement,probabilitysum
0,0.6,0.043289
1,0.636364,0.043347


In [16]:
#taking out a figure plot for cumulative probablility and agreement score
fig = px.line(
    data_frame = osdgcum,
    x = 'agreement',
    y = 'probabilitysum',
  
    labels = {
        'agreement': 'Agreement Score',
        'probabilitysum': 'Cumulative Probrability'
    },
  
)

fig.update_traces(hovertemplate = 'Agreement score: %{x:.2f}<br>Cumulative probability: %{y:.2f}')
fig.update_layout(
    xaxis = {'dtick': 0.1},
    yaxis = {'dtick': 0.25}
)
fig.show()

In [17]:
#now filtering out those queries that are greater than 0.6
osmodel = osmodel.query('agreement >= 0.6 and  labels_positive>labels_negative')
print("now the sha[pe is going to be",osmodel.shape)
osmodel.head(2)

now the sha[pe is going to be (17233, 7)


Unnamed: 0,doi,text_id,text,sdg,labels_negative,labels_positive,agreement
0,10.6027/9789289342698-7-en,00021941702cd84171ff33962197ca1f,"From a gender perspective, Paulgaard points ou...",5,1,7,0.75
2,10.1787/9789264289062-4-en,0004eb64f96e1620cd852603d9cbe4d4,The average figure also masks large difference...,3,1,6,0.714286


In [20]:
#grouping by based on the textid and count recieved 
osdgcum = osmodel.groupby('sdg',as_index=False).agg(count = ('text_id','count'))
osdgcum['share'] = osdgcum['count'].divide(osdgcum['count'].sum()).multiply(100)
osdgcum.shape
osdgcum.head(2)

Unnamed: 0,sdg,count,share
0,1,1146,6.650032
1,2,827,4.798932


In [21]:
#taking the graph about how many count of sdg we are having in th dataset
fig = px.bar(
    data_frame = osdgcum,
    x = 'sdg',
    y = 'count',
    custom_data = ['share'],
    labels = {
        'sdg': 'SDG',
        'count': 'Count'
    },
    color_discrete_sequence = ['#1f77b4'],
    title = 'Figure 2. Distribution of Texts (Agreement >.6) over SDGs'
)

fig.update_traces(hovertemplate = 'SDG %{x}<br>Count: %{y}<br>Share: %{customdata:.2f}%')
fig.update_layout(xaxis = {'type': 'category'})
fig.show()



In [22]:
#now importing spacy library and taking visualization library 
import plotly.io as pio 
pio.templates.default = 'plotly_white'

spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm', disable = ['ner'])

print('SpaCy version:', spacy.__version__)

SpaCy version: 2.2.4


In [23]:
#writing this fucntion will perform text preprocessing and text lemmatization pos tagging
def preprocess_spacy(alpha: List[str]) -> List[str]:
    """
    Preprocess text input using spaCy.
    
    Parameters
    ----------
    alpha: List[str]
        a text corpus.
    
    Returns
    -------
    doc: List[str]
        a cleaned version of the original text corpus.
    """
    docs = list()
    
    for doc in tqdm(nlp.pipe(alpha, batch_size = 128)):
        tokens = list()
        for token in doc:
            if token.pos_ in ['NOUN', 'VERB', 'ADJ']:
                tokens.append(token.lemma_)
        docs.append(' '.join(tokens))
        
    return docs

In [24]:
#taking the text value of the dataset to the docs
osmodel['docs'] = preprocess_spacy(osmodel['text'].values)
osmodel.shape
display(osmodel.head())



17233it [02:17, 125.30it/s]


Unnamed: 0,doi,text_id,text,sdg,labels_negative,labels_positive,agreement,docs
0,10.6027/9789289342698-7-en,00021941702cd84171ff33962197ca1f,"From a gender perspective, Paulgaard points ou...",5,1,7,0.75,gender perspective point labour market fishing...
2,10.1787/9789264289062-4-en,0004eb64f96e1620cd852603d9cbe4d4,The average figure also masks large difference...,3,1,6,0.714286,average figure mask large difference region nu...
7,10.1787/9789264117563-8-en,000bfb17e9f3a00d4515ab59c5c487e7,The Israel Oceanographic and Limnological Rese...,6,0,3,1.0,station monitor quantity quality water coastli...
8,10.18356/805b1ae4-en,001180f5dd9a821e651ed51e30d0cf8c,Previous chapters have discussed ways to make ...,2,0,3,1.0,previous chapter discuss way make food system ...
11,10.1787/9789264310278-en,001f1aee4013cb098da17a979c38bc57,Prescription rates appear to be higher where l...,8,0,3,1.0,prescription rate appear high labour force par...


In [25]:
#splitting the docs and sdg which is text and sdg because they are only having preferred labels and accuracy and so we are taking text and sdg as train datset and test dataset
x_train, x_test, y_train, y_test = train_test_split(
    osmodel['docs'].values, 
    osmodel['sdg'].values, 
    test_size = .3,
    random_state = 42
)

In [26]:
#we can see the shape given below
x_train.shape
x_test.shape

(5170,)

In [40]:
#taking the pipeline fuinction and implementing logistic regression pipeline
pipe = Pipeline([
    ('vectoriser', TfidfVectorizer(
        ngram_range = (1, 2),
        max_df = 0.75,
        min_df = 2,
        max_features = 100_000
    )),
    ('selector', SelectKBest(f_classif, k = 5_000)),
    ('clf', LogisticRegression(
        penalty = 'l2',
        C = .9,
        multi_class = 'multinomial',
        class_weight = 'balanced',
        random_state = 42,
        solver = 'newton-cg',
        max_iter = 100
    ))
])

pipe.fit(x_train, y_train)


Pipeline(steps=[('vectoriser',
                 TfidfVectorizer(max_df=0.75, max_features=100000, min_df=2,
                                 ngram_range=(1, 2))),
                ('selector', SelectKBest(k=5000)),
                ('clf',
                 LogisticRegression(C=0.9, class_weight='balanced',
                                    multi_class='multinomial', random_state=42,
                                    solver='newton-cg'))])

In [28]:
#by confusion matrix we can see that it is having accuracy
yhat = pipe.predict(x_test)
confusion_matrix(y_test,yhat)

array([[255,   1,   5,   3,   6,   2,   0,  13,   4,  36,   7,   3,   2,
          0,   0],
       [  9, 221,   2,   1,   3,   2,   0,   5,   2,   2,   0,   1,   3,
          2,   3],
       [  6,   6, 505,   4,  14,   1,   1,   4,  12,   4,   9,   4,   2,
          0,   0],
       [  4,   1,   4, 640,   6,   0,   2,  11,  18,   6,   0,   3,   3,
          0,   3],
       [  5,   2,   4,  12, 637,   0,   0,  25,   2,   6,   2,   1,   1,
          0,   0],
       [  2,   9,   4,   2,   1, 344,   2,   1,   2,   1,   8,   2,   6,
          0,   4],
       [  1,   3,   2,   2,   1,   4, 408,   4,  17,   1,   8,  12,  12,
          0,   1],
       [  4,   4,   4,   7,  12,   0,   0, 173,  21,  13,   3,   2,   1,
          0,   1],
       [  0,   3,   0,   4,   1,   0,   6,   9, 166,   2,   8,   5,   6,
          1,   0],
       [  6,   1,   5,   2,   4,   0,   0,  22,   3,  80,   3,   0,   0,
          0,   0],
       [  0,   1,   5,   2,   2,   7,   4,   6,  13,   7, 314,   6,   8,
       

In [29]:

print(classification_report(y_test, yhat, zero_division = 0))


              precision    recall  f1-score   support

           1       0.87      0.76      0.81       337
           2       0.83      0.86      0.85       256
           3       0.93      0.88      0.91       572
           4       0.94      0.91      0.93       701
           5       0.93      0.91      0.92       697
           6       0.92      0.89      0.90       388
           7       0.93      0.86      0.89       476
           8       0.62      0.71      0.66       245
           9       0.61      0.79      0.69       211
          10       0.50      0.63      0.56       126
          11       0.85      0.83      0.84       379
          12       0.58      0.78      0.66        77
          13       0.83      0.86      0.84       310
          14       0.96      0.91      0.93       220
          15       0.85      0.83      0.84       175

    accuracy                           0.85      5170
   macro avg       0.81      0.83      0.82      5170
weighted avg       0.86   

according to this we can see that this is having more accuracy as it is 0.85

In [30]:
osdgcum = get_top_features(pipe['vectoriser'], pipe['clf'], pipe['selector'], top_n = 15)
print('Shape:', osdgcum.shape)
osdgcum.head()

Shape: (225, 3)


Unnamed: 0,sdg,feature,coef
0,1,poverty,12.359248
1,1,poor,7.487851
2,1,child,4.959211
3,1,deprivation,4.511084
4,1,income,4.473869


In [31]:
#here we are plotting which are having more a=occurences and which sgd they belong to 
import plotly.express as px

osdgcum.sort_values(['sdg', 'coef'], ignore_index = True, inplace = True)

colors = px.colors.qualitative.Dark24[:15]
template = 'SDG: %{customdata}<br>Feature: %{y}<br>Coefficient: %{x:.2f}'

fig = px.bar(
    data_frame = osdgcum,
    x = 'coef',
    y = 'feature',
    custom_data = ['sdg'],
    facet_col = 'sdg',
    facet_col_wrap = 3,
    #facet_col_spacing = .15,
    orientation = 'h',
    height = 1200,
    labels = {
        'coef': 'Coefficient',
        'feature': ''
    },
    title = '15 predictors'
)

fig.for_each_trace(lambda x: x.update(hovertemplate = template))
fig.for_each_trace(lambda x: x.update(marker_color = colors.pop(0)))
fig.update_yaxes(matches = None, showticklabels = True)

fig.show()

In [32]:
osmodel.loc[osmodel.sdg == 15].sample(10)[['text','sdg']]

Unnamed: 0,text,sdg
30778,"The Law regulates breeding, protection, huntin...",15
5873,In addition to difficulties linked to implemen...,15
12181,The lack of knowledge on such transfrontier mi...,15
26832,The EU is also a major destination for Bosnia ...,15
26654,Some of the indicators compile information abo...,15
9119,A new Forestry Act (LMD 2005] was approved in ...,15
14285,The same is true for farmland: high-yield crop...,15
23366,"Circassian walnut (Juglans regia), occupies a ...",15
19669,This is the most demanding use of values and r...,15
28995,There is no socialisation of the forest.” Inde...,15


In [33]:
osmodel.loc[osmodel.sdg == 10].sample(10)[['text','sdg']]
osmodel.loc[osmodel.sdg == 11].sample(10)[['text','sdg']]
                                            

Unnamed: 0,text,sdg
11798,The house is the space where children spend mo...,11
32100,"In travel time figures, markers refer to avera...",11
5226,In the interest of effective transport plannin...,11
7170,Vehicle and service improvements in the system...,11
27818,This is the concept of risk and is computed as...,11
13400,What is essential about these is that they hav...,11
26381,"Of Kaskelen’s employed population, 31% commute...",11
24131,"In the urban core and surrounding districts, a...",11
15065,These forward looking analyses suggest that ne...,11
4835,This was not an issue when Clermont-Ferrand wa...,11


In [34]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer

pipelinesgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('selector', SelectKBest(f_classif, k = 5_000)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None))
    
])
pipelinesgd.fit(x_train,y_train)

yhat = pipelinesgd.predict(x_test)
confusion_matrix(y_test,yhat)
print(classification_report(y_test, yhat,zero_division = 0))

              precision    recall  f1-score   support

           1       0.82      0.82      0.82       337
           2       0.82      0.80      0.81       256
           3       0.89      0.93      0.91       572
           4       0.85      0.97      0.90       701
           5       0.85      0.97      0.90       697
           6       0.84      0.94      0.89       388
           7       0.86      0.92      0.89       476
           8       0.75      0.49      0.59       245
           9       0.84      0.53      0.65       211
          10       0.98      0.33      0.49       126
          11       0.84      0.84      0.84       379
          12       0.94      0.44      0.60        77
          13       0.80      0.86      0.83       310
          14       0.92      0.89      0.91       220
          15       0.90      0.78      0.83       175

    accuracy                           0.85      5170
   macro avg       0.86      0.77      0.79      5170
weighted avg       0.85   

In [35]:
from sklearn.naive_bayes import MultinomialNB
pipelinebayes = Pipeline([
    ('vectoriser', TfidfVectorizer(
        ngram_range = (1, 2),
        max_df = 0.75,
        min_df = 2,
        max_features = 100_000
    )),
    ('selector', SelectKBest(f_classif, k = 5_000)),
    ('clf', MultinomialNB()
    )
])
pipelinebayes.fit(x_train,y_train)
yhat = pipelinebayes.predict(x_test)
pred3 = yhat
confusion_matrix(y_test, yhat)
print(classification_report(y_test, yhat, zero_division = 0))

              precision    recall  f1-score   support

           1       0.80      0.71      0.75       337
           2       0.92      0.56      0.70       256
           3       0.89      0.89      0.89       572
           4       0.67      0.97      0.80       701
           5       0.61      0.97      0.75       697
           6       0.78      0.89      0.83       388
           7       0.66      0.93      0.77       476
           8       0.82      0.13      0.22       245
           9       1.00      0.13      0.23       211
          10       0.00      0.00      0.00       126
          11       0.86      0.78      0.82       379
          12       1.00      0.01      0.03        77
          13       0.79      0.79      0.79       310
          14       0.97      0.58      0.72       220
          15       0.98      0.31      0.48       175

    accuracy                           0.74      5170
   macro avg       0.78      0.58      0.58      5170
weighted avg       0.77   

In [36]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer

pipelinesgdclass = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('selector', SelectKBest(f_classif, k = 5_000)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None))
    
])

pipelinesgdclass.fit(x_train, y_train)

y_hat = pipelinesgdclass.predict(x_test)
pred1 = y_hat
confusion_matrix(y_test, yhat)

print(classification_report(y_test, yhat, zero_division = 0))

              precision    recall  f1-score   support

           1       0.80      0.71      0.75       337
           2       0.92      0.56      0.70       256
           3       0.89      0.89      0.89       572
           4       0.67      0.97      0.80       701
           5       0.61      0.97      0.75       697
           6       0.78      0.89      0.83       388
           7       0.66      0.93      0.77       476
           8       0.82      0.13      0.22       245
           9       1.00      0.13      0.23       211
          10       0.00      0.00      0.00       126
          11       0.86      0.78      0.82       379
          12       1.00      0.01      0.03        77
          13       0.79      0.79      0.79       310
          14       0.97      0.58      0.72       220
          15       0.98      0.31      0.48       175

    accuracy                           0.74      5170
   macro avg       0.78      0.58      0.58      5170
weighted avg       0.77   

In [37]:
logisticpipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('selector', SelectKBest(f_classif, k = 5_000)),
    ('clf', LogisticRegression(
        penalty = 'l2',
        C = .9,
        multi_class = 'multinomial',
        class_weight = 'balanced',
        random_state = 42,
        solver = 'newton-cg',
        max_iter = 100
    ))  
])
logisticpipeline.fit(x_train,y_train)
yhat = logisticpipeline.predict(x_test)
pred2 = yhat
confusion_matrix(y_test, yhat)
print(classification_report(y_test, yhat,zero_division = 0))

              precision    recall  f1-score   support

           1       0.89      0.78      0.83       337
           2       0.83      0.88      0.86       256
           3       0.93      0.90      0.92       572
           4       0.94      0.91      0.93       701
           5       0.93      0.91      0.92       697
           6       0.93      0.89      0.91       388
           7       0.92      0.87      0.90       476
           8       0.62      0.69      0.65       245
           9       0.63      0.79      0.70       211
          10       0.51      0.65      0.57       126
          11       0.86      0.82      0.84       379
          12       0.56      0.74      0.64        77
          13       0.85      0.87      0.86       310
          14       0.95      0.91      0.93       220
          15       0.82      0.83      0.82       175

    accuracy                           0.86      5170
   macro avg       0.81      0.83      0.82      5170
weighted avg       0.87   

In [38]:
randomforest = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
               ('tfidf', TfidfTransformer()),
               ('rf', RandomForestClassifier(n_estimators=50)),
               ])
randomforest.fit(x_train,y_train)
predict = randomforest.predict(x_test)
print(classification_report(y_test,predict))

              precision    recall  f1-score   support

           1       0.77      0.81      0.79       337
           2       0.74      0.75      0.75       256
           3       0.86      0.90      0.88       572
           4       0.85      0.96      0.90       701
           5       0.83      0.96      0.89       697
           6       0.85      0.92      0.89       388
           7       0.82      0.88      0.85       476
           8       0.64      0.50      0.56       245
           9       0.78      0.53      0.63       211
          10       0.87      0.32      0.47       126
          11       0.82      0.82      0.82       379
          12       0.96      0.31      0.47        77
          13       0.79      0.83      0.81       310
          14       0.94      0.86      0.90       220
          15       0.92      0.61      0.74       175

    accuracy                           0.82      5170
   macro avg       0.83      0.73      0.76      5170
weighted avg       0.82   

In [41]:
gradientboostmodel = Pipeline([('vect', CountVectorizer(min_df=5, ngram_range=(1,2))),
                    ('tfidf', TfidfTransformer()),
                    ('gb', GradientBoostingClassifier(n_estimators=50)),
                    ])

gradientboostmodel.fit(x_train, y_train)
pred = gradientboostmodel.predict(x_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           1       0.80      0.81      0.80       337
           2       0.82      0.76      0.79       256
           3       0.77      0.89      0.83       572
           4       0.89      0.92      0.90       701
           5       0.88      0.94      0.91       697
           6       0.83      0.92      0.88       388
           7       0.88      0.85      0.86       476
           8       0.60      0.58      0.59       245
           9       0.76      0.56      0.64       211
          10       0.59      0.43      0.50       126
          11       0.80      0.78      0.79       379
          12       0.57      0.42      0.48        77
          13       0.76      0.81      0.78       310
          14       0.96      0.83      0.89       220
          15       0.86      0.69      0.76       175

    accuracy                           0.82      5170
   macro avg       0.78      0.74      0.76      5170
weighted avg       0.82   

In [45]:
#now after getting al the accuracies let us compare which classifier is having good classification
log_accuracy = accuracy_score(pred2, y_test)
sgd_accuracy = accuracy_score(pred1, y_test)
naivebayes_accuracy = accuracy_score(pred3, y_test)
randomforest_accuracy = accuracy_score(predict, y_test)
gradientboost_accuracy = accuracy_score(pred, y_test)
print(log_accuracy)
print(sgd_accuracy)
print(naivebayes_accuracy)
print(randomforest_accuracy)
print(gradientboost_accuracy)


0.8586073500967119
0.8491295938104448
0.7390715667311412
0.8237911025145068
0.8187620889748549


so by this we cans see that logistic is having more accuracy


In [43]:
osmodel['docs'] = preprocess_spacy(osmodel['text'].values)


17233it [02:09, 133.50it/s]
