#### We're going to predict the "exact keyword" for a course from its "titles"

In [1]:
# Import required modules
import pandas as pd
import re
from nltk.corpus import stopwords
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
import zipfile
import pickle

In [2]:
# Read in the file
df = pd.read_csv("courseWithKeyword.csv").drop("Unnamed: 3", axis=1)
df = df.dropna()
df.head()

Unnamed: 0,Titles,Exact Keyword,broadKeyword
0,Proofreading & copy editing course (Level 4 Di...,proofreading,copy editing
1,Public relations course (Level 4 Diploma),public relations,public relation
2,Learn Arabic,arabic,language
3,Learn French includes MP3 Downloads,french,language
4,Learn German,german,language


In [3]:
# Cleaning and processing

# "[^a-zA-Z0-9]" replaces all except words and digits
# "(\s\d*)" replaces digits if it has at least 1 space before it.
# "(\s{2,})" replaces at least 2 or more digits with a single space
df.Titles = df.Titles.str.replace("[^a-zA-Z0-9]", " ").str.replace("(\s\d*)", " ").str.replace("(\s{2,})", " ")

# Lowecase titles
df.Titles = df.Titles.str.lower().str.strip()

In [4]:
# Stopwords
STOPWORDS = set(stopwords.words("english"))

# We are searching in a dictionary rather than a set which is basically a hashmap. And in hashmap the search time is O(1)
STOPWORDS = Counter(STOPWORDS)

# Remove stopwords 
df.Titles = df.Titles.apply(lambda x: [item for item in x.split() if item not in STOPWORDS]).str.join(" ")

In [5]:
# Extract feature vector and response vector
X = df.Titles
y = df["Exact Keyword"]

# Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.3, random_state = 2)

In [6]:
# Our ist moddel will be sgd classifier. We create a pipeline for that
sgd = Pipeline([("vect", CountVectorizer()),
                ("tfidf", TfidfTransformer()),
                ("clf", SGDClassifier(loss="hinge",
                                      penalty="l2",
                                      alpha=1e-3,
                                      random_state=42,
                                      max_iter=5,
                                      tol=None)),
              ])

# Fit the model
sgd.fit(X_train, y_train)

# Make prediction on test set
y_pred_sgd = sgd.predict(X_test)

# Calculate sgd accuracy
sgd_accuracy = round(accuracy_score(y_pred_sgd, y_test)*100, 2)

print(f"Sgd accuracy is: {sgd_accuracy}")

Sgd accuracy is: 61.43


In [7]:
# Train logistic regression now
logreg = Pipeline([("vect", CountVectorizer()),
                ("tfidf", TfidfTransformer()),
                ("clf", LogisticRegression(n_jobs=1, C=1e5)),
               ])

# Fit the model
logreg.fit(X_train, y_train)

# Make prediction on test set
y_pred_logreg = logreg.predict(X_test)

# Calculate accuracy
logreg_accuracy = round(accuracy_score(y_pred_logreg, y_test)*100, 2)

print(f"Logistic regression accuracy is: {logreg_accuracy}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic regression accuracy is: 67.8


In [8]:
# Save the logistic regression model for future use
# with open("LogisticRegressionModel.pickle", "wb") as f:
#     pickle.dump(logreg, f) # This will be a zip file after creating as .pickle file to save space

In [14]:
# Decompress the file and predict a category
logreg.predict(["Master french language"])

array(['french'], dtype=object)