<a href="https://colab.research.google.com/github/mchhour31/NN_work/blob/main/nlp_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
from sklearn.datasets import fetch_20newsgroups

df_train = fetch_20newsgroups(subset='train', shuffle=True)
X_train = df_train.data
y_train = df_train.target

df_test = fetch_20newsgroups(subset='test', shuffle=True)
X_test = df_test.data
y_test = df_test.target

In [5]:
df_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
df_test.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Naive Approach

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# bag of words model
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train)
X_train_count.shape

(11314, 130107)

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

# tf-idf (penalises repeating words)
tfidf = TfidfTransformer()
X_train_idf = tfidf.fit_transform(X_train_count)
X_train_idf.shape

(11314, 130107)

In [9]:
tfidf.fit_transform(cv.fit_transform(X_test)).shape

(7532, 93564)

In [10]:
from sklearn.naive_bayes import MultinomialNB

# naive bayes classifier
clf = MultinomialNB().fit(X_train_idf, y_train)

## Pipeline of Naive Approach

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', MultinomialNB())
])

text_clf = text_clf.fit(X_train, y_train)
prediction = text_clf.predict(X_test)
print(np.mean(prediction == y_test))

0.7738980350504514


## Using SVM for classification

In [16]:
from sklearn.linear_model import SGDClassifier

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', SGDClassifier(penalty='l2', alpha=1e-3, max_iter=1000, random_state=0)) # ridge
])

y_pred = text_clf.fit(X_train, y_train).predict(X_test)
print(np.mean(y_test == y_pred))

0.8238183749336165


In [19]:
from sklearn.model_selection import GridSearchCV # exhaustive search through values, selects best one

parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

In [22]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

ValueError: ignored