In [None]:
# Multiclass prediction from text variables, adapted from:
# https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f
import pandas as pd
import numpy as np
# Input CSV; the later cells read its field_micro, pubjournal and pubtitle columns.
csv_path = 'C:/Users/melyg/Desktop/Networks/Data/fields_for_micro.csv'
df = pd.read_csv(csv_path)
print(df.shape)
# The sampling step `df = df1.sample(frac =.6)` is commented out, so the shape
# printed below is the same as the one printed above.
print(df.shape)
df.head()

In [None]:
#Keep only the label and the two text columns, drop rows with missing text, build the label dictionaries
from io import StringIO

col = ['field_micro', 'pubjournal', 'pubtitle']
# Rows are kept only when both text columns are present.
has_text = pd.notnull(df['pubtitle']) & pd.notnull(df['pubjournal'])
# A single .loc selection followed by .copy() gives an independent frame, so the
# category_id assignment below does not raise SettingWithCopyWarning.
df = df.loc[has_text, col].copy()
# Integer code per distinct field_micro value, numbered from zero in order of first appearance.
# NOTE(review): factorize() marks missing field_micro values with -1 by default - confirm the label column has no nulls.
df['category_id'] = df['field_micro'].factorize()[0]
# One row per (field_micro, category_id) pair, ordered by id.
category_id_df = df[['field_micro', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)  # field_micro -> category_id
id_to_category = dict(category_id_df[['category_id', 'field_micro']].values)  # category_id -> field_micro
df.head()

In [None]:
#Sanity check: how many rows fall into each category id
array = df['category_id'].to_numpy()
# np.unique with return_counts=True yields (sorted distinct ids, number of rows per id).
ids_and_counts = np.unique(array, return_counts=True)
print(ids_and_counts)
# Same information through pandas, ordered from most to least frequent id.
frequency = df['category_id'].value_counts()
print(frequency)

In [None]:
#Imbalanced classes (visualization): number of non-null titles per field_micro
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8,6))
titles_per_field = df.groupby('field_micro')['pubtitle'].count()
# ylim=0 anchors the bars at zero so the bar heights are directly comparable.
titles_per_field.plot.bar(ylim=0, ax=ax)
plt.show()

In [None]:
#Text representation: texts are converted to a more manageable representation
#Use the bag of words model: for each document (here, a title) the presence (and often the frequency) of words is taken into account, but the order in which they occur is ignored
#The weighting used is Term Frequency - Inverse Document Frequency (tf-idf), computed with "sklearn.feature_extraction.text.TfidfVectorizer"

# sublinear_tf is set to True to use a logarithmic form for frequency.
# min_df is the minimum number of documents a word must be present in to be kept.
# norm is set to l2, to ensure all our feature vectors have a Euclidean norm of 1.
# ngram_range is set to (1, 2) to indicate that we want to consider both unigrams and bigrams.
# stop_words is set to "english" to remove common English words ("a", "the", ...) and reduce the number of noisy features.

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# Dense tf-idf matrix for the titles: one row per document, one column per kept unigram/bigram.
features1 = tfidf.fit_transform(df.pubtitle).toarray()
# Read the class ids straight from df instead of aliasing the temporary `array`
# and deleting it: the cell can then be re-run, and it no longer depends on the
# "Checking" cell having been executed first. (The old `del array` released no
# memory because `labels` referenced the same array.)
labels = df['category_id'].to_numpy()
print(features1.shape)


In [None]:
#Use sklearn.feature_selection.chi2 to find the title terms most associated with each field_micro category
from sklearn.feature_selection import chi2
import numpy as np
N = 2  # how many top unigrams / bigrams are printed per category
for field_micro, category_id in sorted(category_to_id.items()):
  # One-vs-rest target: True for the rows of the current category.
  # chi2 returns (scores, p-values); only the scores are used.
  chi2_scores_1 = chi2(features1, labels == category_id)[0]
  # Ascending argsort, so the highest-scoring terms end up at the tail.
  ranked_terms_1 = np.array(tfidf.get_feature_names_out())[np.argsort(chi2_scores_1)]
  # A term without a space is a unigram; a term with exactly one space is a bigram.
  unigrams_1 = [term for term in ranked_terms_1 if term.count(' ') == 0]
  bigrams_1 = [term for term in ranked_terms_1 if term.count(' ') == 1]
  print("# '{}':".format(field_micro))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams_1[-N:])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams_1[-N:])))

In [None]:
#Same tf-idf representation, this time for the journal name (pubjournal)
from sklearn.feature_extraction.text import TfidfVectorizer
# Identical settings to the title vectorizer.
vectorizer_options = dict(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# NOTE: this rebinds `tfidf`; from here on it holds the journal vocabulary, not the title one.
tfidf = TfidfVectorizer(**vectorizer_options)
journal_tfidf = tfidf.fit_transform(df['pubjournal'])
features2 = journal_tfidf.toarray()
print(features2.shape)

In [None]:
#Chi-squared ranking of the journal-name terms per field_micro category
from sklearn.feature_selection import chi2
import numpy as np
N = 2  # how many top unigrams / bigrams are printed per category
for field_micro, category_id in sorted(category_to_id.items()):
  in_category = labels == category_id
  # First element of chi2's result holds the per-feature scores.
  chi2_scores_2 = chi2(features2, in_category)[0]
  order_2 = np.argsort(chi2_scores_2)  # ascending: strongest terms last
  feature_names_2 = np.array(tfidf.get_feature_names_out())[order_2]
  unigrams_2 = []
  bigrams_2 = []
  for name in feature_names_2:
    n_words = len(name.split(' '))
    if n_words == 1:
      unigrams_2.append(name)
    elif n_words == 2:
      bigrams_2.append(name)
  print("# '{}':".format(field_micro))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams_2[-N:])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams_2[-N:])))

In [None]:
#One feature matrix: title tf-idf columns followed by journal tf-idf columns
feature_blocks = [features1, features2]
# axis=1 joins column-wise, so each row still describes a single publication.
features_final = np.concatenate(feature_blocks, axis=1)
features_final.shape

In [None]:
# Multiclass baseline
# Naive Bayes classifier trained on pubtitle only, with field_micro as the target
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
X_train, X_test, y_train, y_test = train_test_split(df.pubtitle, df.field_micro, random_state = 0)
# Two-step text pipeline: raw token counts first, then tf-idf re-weighting of those counts.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

In [None]:
# Classify a few unseen titles with the Naive Bayes model.
# New text has to go through the same two steps as the training data
# (token counts -> tf-idf weights). Passing raw counts to a model that was
# fitted on tf-idf features gives it inputs on a different scale than it was
# trained on.
sample_titles = [
  "a regulatory policy strategy for protecting immigrant workers",
  "a relative question: the developing world is reevaluating what it means to be poor",
  "10 central bank independence: growing threats",
]
for title in sample_titles:
  title_counts = count_vect.transform([title])
  print(clf.predict(tfidf_transformer.transform(title_counts)))

In [None]:
#Deleting previous objects to deal with memory problems
# `df1` is not created anywhere above (its only mention is a commented-out line),
# so an unconditional `del df1` raised NameError and stopped the cell before the
# feature matrices were released. Each name is now removed only if it exists,
# which also makes the cell safe to re-run.
for _stale_name in ('df1', 'features1', 'features2'):
  globals().pop(_stale_name, None)

In [None]:
# Compare several classifiers on the combined title + journal features with
# 5-fold cross-validated accuracy, then plot the per-fold scores.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    # Seeded like the other stochastic steps so repeated runs give the same scores.
    LinearSVC(random_state=0),
    MultinomialNB(),
   #LogisticRegression(random_state=0),
]
CV = 5  # number of cross-validation folds
# One (model_name, fold_idx, accuracy) record per model and fold; the frame is
# built once from these records (the former placeholder DataFrame was never used).
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features_final, labels, scoring='accuracy', cv=CV)
  entries.extend((model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

import seaborn as sns
# Box plot for the spread, individual fold scores overlaid as points.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [None]:
cv_df.groupby('model_name').accuracy.mean()

In [None]:
#Using the best model: refit LinearSVC on a 75/25 split and inspect the confusion matrix
# random_state makes the fit repeatable, matching the seeded split below.
model = LinearSVC(random_state=0)
# df.index is split alongside the data so test rows can be looked up in df afterwards.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features_final, labels, df.index, test_size=0.25, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix
# Passing the full list of category ids fixes the matrix layout: row/column i is
# always the i-th row of category_id_df, even when a class is absent from the
# test split. Without it the matrix shrinks, and the tick labels and any
# conf_mat[actual, predicted] lookups become misaligned.
conf_mat = confusion_matrix(y_test, y_pred, labels=category_id_df.category_id.values)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df.field_micro.values, yticklabels=category_id_df.field_micro.values,
            ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
from IPython.display import display
# For every (actual, predicted) pair of different classes that was confused at
# least 10 times, print the count and show the affected test rows.
category_ids = category_id_df.category_id
for predicted in category_ids:
  for actual in category_ids:
    if predicted == actual:
      continue  # the diagonal holds correct predictions
    n_confused = conf_mat[actual, predicted]
    if n_confused < 10:
      continue
    print("'{}' predicted as '{}' : {} examples.".format(id_to_category[actual], id_to_category[predicted], n_confused))
    # Boolean mask over the test split, mapped back to df rows through the saved index labels.
    confused_mask = (y_test == actual) & (y_pred == predicted)
    display(df.loc[indices_test[confused_mask]])
    print('')

In [None]:
from sklearn import metrics
# Text report of per-class metrics on the test split; classes are identified by their category ids.
report = metrics.classification_report(y_test, y_pred)
print(report)