In [1]:
from ThematicTextClassify.TextClassifier import *
from ThematicTextClassify.Preprocessing import *



In [2]:
# Load the hand-labelled links that serve as training data.
df = pd.read_csv('Categorized_Links.csv')

# One text document per row: title followed by description.
# NOTE(review): the two fields are concatenated without a separator, so the
# last word of the title fuses with the first word of the description. The
# new-data cell builds its documents the same way; change both together or
# not at all.
df['Text'] = df['Title'] + df['Description']
# Discard rows for which no document could be built (Text is NaN).
df = df.dropna(subset=['Text'], axis=0)

# Clean each document with the project helper preprocess_text (per the
# original note it is defined in TextClassifier.py), then glue the returned
# pieces back into one space-separated string for the vectorizers.
df['Processed Text'] = df['Text'].map(preprocess_text)
df['processed_string'] = df['Processed Text'].map(' '.join)

# Binary target: 'Distribution' versus everything else.
df['Class'] = (df['Category'] == 'Distribution').map(
    {True: 'Distribution', False: 'Other'}
)

# 'Distribution' sorts before 'Other', so when a link occurs more than once
# keep='first' prefers the row labelled 'Distribution'.
df = (
    df.sort_values('Class')
      .drop_duplicates(['Link'], keep='first')
      .reset_index(drop=True)
)
len(df)

301

In [3]:
# Hold out 15% of the labelled rows for evaluation; the fixed random_state
# keeps the split repeatable. The whole frame is split (not only the text
# column), so text_train / text_test are DataFrames.
# NOTE(review): the split is not stratified although the classes are
# imbalanced - consider stratify=df['Class'].
text_train, text_test, class_train, class_test = train_test_split(
    df, df['Class'], test_size=0.15, random_state=5131
)

# Combining Classifier Predictions 

In [4]:
# Read the unlabelled records that the trained models will classify.
full = pd.read_csv("NewData.csv")

# Build the documents exactly as for the training data: title + description,
# cleaned by preprocess_text and re-joined into one space-separated string.
full['Text'] = full['Title'] + full['Description']
full['Processed Text'] = full['Text'].map(preprocess_text)
full['processed_string'] = full['Processed Text'].map(' '.join)
full = full.reset_index(drop=True)

In [5]:
# Four (vectorizer, classifier) pairs. Each pair is handed to the project
# helper feature_pipe together with the training split and the new data; the
# value it returns is stored as full['Class'].
pipelines = [
    (CountVectorizer(max_df=0.75, min_df=1, ngram_range=(1, 3)),
     MultinomialNB(alpha=0.75)),
    (CountVectorizer(max_df=0.25, min_df=3, ngram_range=(1, 2)),
     LinearSVC(C=0.1)),
    (TfidfVectorizer(max_df=0.25, min_df=3, ngram_range=(1, 3)),
     LinearSVC(C=0.01)),
    (TfidfVectorizer(max_df=0.75, min_df=3, ngram_range=(1, 1)),
     XGBClassifier(max_depth=7, seed=2, random_state=1995,
                   colsample_bytree=0.3, subsample=0.7)),
]

distribution_hits = []
for vectorizer, classifier in pipelines:
    # full['Class'] is overwritten on every pass, so after the loop it holds
    # the labels of the last pair only. The per-pair selections are kept
    # separately in distribution_hits.
    full['Class'] = feature_pipe(vectorizer, classifier, text_train, class_train, full)
    distribution_hits.append(full[full['Class'] == 'Distribution'])

# Names expected by the cells that combine the individual predictions.
model1, model2, model3, model4 = distribution_hits

In [6]:
# Pool the rows that each of the four models labelled 'Distribution'.
frames = [model1, model2, model3, model4]
result_frame = pd.concat(frames)
print("Length of classified data set before dropping duplicates: ", len(result_frame))

# A link selected by several models appears once per model; keep one row per
# link (the occurrence that comes last in the concatenated frame).
result_frame = result_frame.drop_duplicates(subset=['Link'], keep='last')
print("Length of classified data set after dropping duplicates: ", len(result_frame))

Length of classified data set before dropping duplicates:  387
Length of classified data set after dropping duplicates:  242


In [7]:
# Keep only the descriptive columns plus the predicted class, then write the
# union of the four models' 'Distribution' picks to disk without the index.
distributioncsv = result_frame.loc[:, ['Title', 'Description', 'Link', 'Class']]
distributioncsv.to_csv('Distribution.csv', index=False)

In [8]:
# Show the last rows of the exported selection as a quick sanity check.
distributioncsv.tail()

Unnamed: 0,Title,Description,Link,Class
1445,Eating out – How often and why,Eating out at restaurants or purchasing takeou...,https://www150.statcan.gc.ca/n1/en/catalogue/1...,Distribution
1446,Eating out: Nutrition information on menus and...,Being able to make informed food choices benef...,https://www150.statcan.gc.ca/n1/en/catalogue/1...,Distribution
1509,Eating out – How often and why,Eating out at restaurants or purchasing takeo...,https://www150.statcan.gc.ca/n1/pub/11-627-m/1...,Distribution
1510,Eating out: Nutrition information on menus and...,Being able to make informed food choices bene...,https://www150.statcan.gc.ca/n1/pub/11-627-m/1...,Distribution
1605,Nutritional information on packaged foods,The purpose of the 2016 General Social Survey...,https://www150.statcan.gc.ca/n1/pub/11-627-m/1...,Distribution


# Setting Up Training Data, Holdout, and New data to be Classified (CountVectorizer)
* Note: It is not practical to use a separate vectorizer for each model inside a single ensemble. Therefore we use one CountVectorizer with the most common tuning parameters among the best models above (optimized using GridSearchCV).

In [9]:
# Bag-of-words features for the CountVectorizer-based ensembles.
# The vocabulary is fitted on the training split only and then re-used to
# transform the hold-out split and the new data.
countvect = CountVectorizer(max_df=0.75, min_df=3, ngram_range=(1, 2))
X_train = countvect.fit_transform(text_train['processed_string']).toarray()
X_test = countvect.transform(text_test['processed_string']).toarray()
X_full = countvect.transform(full['processed_string']).toarray()

# Binary targets: 1 = 'Distribution', 0 = 'Other'.
# Encoded with a comparison and an explicit integer cast rather than chained
# Series.replace calls: replacing strings with ints relies on pandas silently
# downcasting the object column, which is deprecated, and an object-dtype
# target is rejected by scikit-learn estimators.
y_train = (class_train == 'Distribution').astype('int64').values
y_test = (class_test == 'Distribution').astype('int64').values

# Stacked Classifier (CountVectorizer)
## Using Logistic Regression as meta_classifier

In [10]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

# Level-one learners of the stacked ensemble.
stack_clf1 = MultinomialNB(alpha=0.75)
stack_clf2 = LogisticRegression(C=0.5, penalty='l2')
# probability=True because the stack is built with use_probas=True.
stack_clf3 = SVC(kernel='linear', C=0.1, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=2)
stack_clf5 = XGBClassifier(
    max_depth=5,
    seed=1,
    random_state=1995,
    colsample_bytree=0.3,
    subsample=0.7,
)
base_learners = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5]

# Level-two learner: logistic regression on the base learners' probabilities.
meta_clf = LogisticRegression(C=1.0, penalty='l2')
sclf_log = StackingCVClassifier(
    classifiers=base_learners,
    use_probas=True,
    meta_classifier=meta_clf,
)

classifiers = base_learners + [sclf_log]
classifier_names = [
    'Multinomial Naive Bayes',
    'Logistic Regression',
    'Linear SVC',
    'Random Forest Classifier',
    'XGBoost',
    'StackingClassifier',
]

# Project helper: prints the cross-validated accuracy of every classifier.
scoring(classifiers, classifier_names, X_train, y_train)

# Fit the stack on the training split and evaluate it on the hold-out split.
sclf_log = sclf_log.fit(X_train, y_train)
prediction_results_int = sclf_log.predict(X_test)

# Translate the 0/1 predictions back to the class names used in class_test.
prediction_results = [
    'Distribution' if label == 1 else 'Other'
    for label in prediction_results_int
]

# Hold-out classification report.
print("\n")
print("Stacked Classifier (CountVect) Clasification Report (Logisti Regression Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.66 (+/- 0.04) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.91 (+/- 0.03) [Logistic Regression]
5-fold cross validated Accuracy: 0.91 (+/- 0.04) [Linear SVC]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.90 (+/- 0.03) [XGBoost]
5-fold cross validated Accuracy: 0.91 (+/- 0.03) [StackingClassifier]


Stacked Classifier (CountVect) Clasification Report (Logisti Regression Meta Classifier)
              precision    recall  f1-score   support

Distribution       0.50      0.33      0.40         9
       Other       0.85      0.92      0.88        37

    accuracy                           0.80        46
   macro avg       0.68      0.63      0.64        46
weighted avg       0.78      0.80      0.79        46



# Stacked Classifier (CountVectorizer)
## Using XGBoost as the Meta Classifier

In [12]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

# Level-one learners of the stacked ensemble.
stack_clf1 = MultinomialNB(alpha=0.75)
stack_clf2 = LogisticRegression(C=0.5, penalty='l2')
# probability=True because the stack is built with use_probas=True.
stack_clf3 = SVC(kernel='linear', C=0.1, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=2)
stack_clf5 = XGBClassifier(
    max_depth=5,
    seed=1,
    random_state=1995,
    colsample_bytree=0.3,
    subsample=0.7,
)
base_learners = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5]

# Level-two learner: XGBoost on the base learners' probabilities.
meta_clf = XGBClassifier(
    max_depth=5,
    seed=1,
    random_state=1995,
    colsample_bytree=0.3,
    subsample=0.7,
)
sclf_XGB = StackingCVClassifier(
    classifiers=base_learners,
    use_probas=True,
    meta_classifier=meta_clf,
)

classifiers = base_learners + [sclf_XGB]
classifier_names = [
    'Multinomial Naive Bayes',
    'Logistic Regression',
    'Linear SVC',
    'Random Forest Classifier',
    'XGBoost',
    'StackingClassifier',
]

# Project helper: prints the cross-validated accuracy of every classifier.
scoring(classifiers, classifier_names, X_train, y_train)

# Fit the stack on the training split and evaluate it on the hold-out split.
sclf_XGB = sclf_XGB.fit(X_train, y_train)
prediction_results_int = sclf_XGB.predict(X_test)

# Translate the 0/1 predictions back to the class names used in class_test.
prediction_results = [
    'Distribution' if label == 1 else 'Other'
    for label in prediction_results_int
]

# Hold-out classification report.
print("\n")
print("Stacked Classifier (CountVect) Clasification Report (XGBoost as Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.66 (+/- 0.04) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.91 (+/- 0.03) [Logistic Regression]
5-fold cross validated Accuracy: 0.91 (+/- 0.04) [Linear SVC]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.90 (+/- 0.03) [XGBoost]
5-fold cross validated Accuracy: 0.91 (+/- 0.04) [StackingClassifier]


Stacked Classifier (CountVect) Clasification Report (XGBoost as Meta Classifier)
              precision    recall  f1-score   support

Distribution       0.50      0.33      0.40         9
       Other       0.85      0.92      0.88        37

    accuracy                           0.80        46
   macro avg       0.68      0.63      0.64        46
weighted avg       0.78      0.80      0.79        46



# Voting Classifier (CountVectorizer)

In [63]:
# Members of the hard-voting ensemble.
vote_clf1 = MultinomialNB(alpha=0.75)
vote_clf2 = LogisticRegression(C=0.5, penalty='l2')
vote_clf3 = LinearSVC(C=0.1)
vote_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=3)
vote_clf5 = XGBClassifier(max_depth=5, seed=1, random_state=1995,
                          colsample_bytree=0.3, subsample=0.7)

# voting='hard': the ensemble predicts the majority label of its members.
eclf1 = VotingClassifier(
    estimators=[
        ('Multinomial Naive Bayes', vote_clf1),
        ('Logistic Regression Classifier', vote_clf2),
        ('LinearSVC', vote_clf3),
        ('RandomForestClassifier', vote_clf4),
        ('XGBoost', vote_clf5),
    ],
    voting='hard',
)
eclf1 = eclf1.fit(X_train, y_train)

classifiers = [vote_clf1, vote_clf2, vote_clf3, vote_clf4, vote_clf5, eclf1]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC',
                    'Random Forest Classifier', 'XGBoost', 'VotingClassifier']

# Project helper: prints the cross-validated accuracy of every classifier.
scoring(classifiers, classifier_names, X_train, y_train)

# Evaluate on the hold-out split. Accuracy is part of the classification
# report printed below, so no separate score() call is needed.
predicted = eclf1.predict(X_test)

# Translate the 0/1 predictions back to the class names used in class_test.
pred_results = ['Distribution' if label == 1 else 'Other' for label in predicted]

# Hold-out classification report.
print("\n")
print("Voting Classifier (CountVect) Clasification Report")
print(classification_report(class_test.tolist(), pred_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.66 (+/- 0.04) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.91 (+/- 0.03) [Logistic Regression]
5-fold cross validated Accuracy: 0.91 (+/- 0.03) [Linear SVC]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.90 (+/- 0.03) [XGBoost]
5-fold cross validated Accuracy: 0.92 (+/- 0.03) [VotingClassifier]


Voting Classifier (CountVect) Clasification Report
              precision    recall  f1-score   support

Distribution       0.50      0.33      0.40         9
       Other       0.85      0.92      0.88        37

    accuracy                           0.80        46
   macro avg       0.68      0.63      0.64        46
weighted avg       0.78      0.80      0.79        46



# Setting Up Data (tfidfVectorizer)

In [13]:
# TF-IDF features for the TF-IDF-based ensembles.
# The vocabulary and IDF weights are fitted on the training split only and
# then re-used to transform the hold-out split and the new data.
# NOTE: this rebinds X_train / X_test / y_train / y_test / X_full; every cell
# that follows works on these TF-IDF matrices.
tfidf = TfidfVectorizer(max_df=0.5, min_df=3, ngram_range=(1, 2))
X_train = tfidf.fit_transform(text_train['processed_string']).toarray()
X_test = tfidf.transform(text_test['processed_string']).toarray()
X_full = tfidf.transform(full['processed_string']).toarray()

# Binary targets: 1 = 'Distribution', 0 = 'Other'.
# Encoded with a comparison and an explicit integer cast rather than chained
# Series.replace calls: replacing strings with ints relies on pandas silently
# downcasting the object column, which is deprecated, and an object-dtype
# target is rejected by scikit-learn estimators.
y_train = (class_train == 'Distribution').astype('int64').values
y_test = (class_test == 'Distribution').astype('int64').values

# Stacked Classifier (tfidfVectorizer)
## Using Logistic Regression as meta_classifier

In [65]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

# Level-one learners of the stacked ensemble.
stack_clf1 = MultinomialNB(alpha=0.25)
# penalty='l1' needs a solver that supports it. 'liblinear' is named
# explicitly because scikit-learn >= 0.22 defaults to 'lbfgs', which raises
# a ValueError for L1; on older releases 'liblinear' was already the default,
# so results there are unchanged.
stack_clf2 = LogisticRegression(C=0.25, penalty='l1', solver='liblinear')
# probability=True because the stack is built with use_probas=True.
stack_clf3 = SVC(kernel='linear', C=0.05, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=300, random_state=3)
stack_clf5 = XGBClassifier(max_depth=7, seed=2, random_state=1995,
                           colsample_bytree=0.3, subsample=0.7)

# Level-two learner: L1-regularised logistic regression (same solver note).
meta_clf = LogisticRegression(C=0.25, penalty='l1', solver='liblinear')
sclf_tfidf = StackingClassifier(
    classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
    use_probas=True,
    average_probas=False,
    meta_classifier=meta_clf,
)
classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_tfidf]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC',
                    'Random Forest Classifier', 'XGBoost', 'StackingClassifier']

# Project helper: prints the cross-validated accuracy of every classifier.
scoring(classifiers, classifier_names, X_train, y_train)

# Fit the stack on the training split and evaluate it on the hold-out split.
# sclf_tfidf stays in the namespace; it is the model later applied to X_full.
sclf_tfidf = sclf_tfidf.fit(X_train, y_train)
prediction_results_int = sclf_tfidf.predict(X_test)

# Translate the 0/1 predictions back to the class names used in class_test.
prediction_results = [
    'Distribution' if label == 1 else 'Other'
    for label in prediction_results_int
]

# Hold-out classification report.
print("\n")
print("Stacked Classifier (TF-IDF) Clasification Report (Logistic Regression as meta classifier)")
print(classification_report(class_test.tolist(), prediction_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.87 (+/- 0.05) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Logistic Regression]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Linear SVC]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.91 (+/- 0.01) [XGBoost]
5-fold cross validated Accuracy: 0.91 (+/- 0.01) [StackingClassifier]


Stacked Classifier (TF-IDF) Clasification Report (Logistic Regression as meta classifier)
              precision    recall  f1-score   support

Distribution       0.60      0.33      0.43         9
       Other       0.85      0.95      0.90        37

    accuracy                           0.83        46
   macro avg       0.73      0.64      0.66        46
weighted avg       0.80      0.83      0.81        46



# Stacked Classifier (tfidfVectorizer)
## Using XGBoost as meta_classifier

In [66]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

# Level-one learners of the stacked ensemble.
stack_clf1 = MultinomialNB(alpha=0.25)
# penalty='l1' needs a solver that supports it. 'liblinear' is named
# explicitly because scikit-learn >= 0.22 defaults to 'lbfgs', which raises
# a ValueError for L1; on older releases 'liblinear' was already the default,
# so results there are unchanged.
stack_clf2 = LogisticRegression(C=0.25, penalty='l1', solver='liblinear')
# probability=True because the stack is built with use_probas=True.
stack_clf3 = SVC(kernel='linear', C=0.05, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=300, random_state=3)
stack_clf5 = XGBClassifier(max_depth=7, seed=2, random_state=1995,
                           colsample_bytree=0.3, subsample=0.7)

# Level-two learner: XGBoost on the base learners' probabilities.
meta_clf = XGBClassifier(max_depth=7, seed=2, random_state=1995,
                         colsample_bytree=0.3, subsample=0.7)

sclf_XGB_tfidf = StackingClassifier(
    classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
    use_probas=True,
    average_probas=False,
    meta_classifier=meta_clf,
)
classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_XGB_tfidf]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC',
                    'Random Forest Classifier', 'XGBoost', 'StackingClassifier']

# Project helper: prints the cross-validated accuracy of every classifier.
scoring(classifiers, classifier_names, X_train, y_train)

# Fit the stack on the training split and evaluate it on the hold-out split.
sclf_XGB_tfidf = sclf_XGB_tfidf.fit(X_train, y_train)
prediction_results_int = sclf_XGB_tfidf.predict(X_test)

# Translate the 0/1 predictions back to the class names used in class_test.
prediction_results = [
    'Distribution' if label == 1 else 'Other'
    for label in prediction_results_int
]

# Hold-out classification report.
print("\n")
print("Stacked Classifier (TF-IDF) Clasification Report (XGBoost as Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.87 (+/- 0.05) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Logistic Regression]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Linear SVC]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.91 (+/- 0.01) [XGBoost]
5-fold cross validated Accuracy: 0.91 (+/- 0.05) [StackingClassifier]


Stacked Classifier (TF-IDF) Clasification Report (XGBoost as Meta Classifier)
              precision    recall  f1-score   support

Distribution       0.38      0.33      0.35         9
       Other       0.84      0.86      0.85        37

    accuracy                           0.76        46
   macro avg       0.61      0.60      0.60        46
weighted avg       0.75      0.76      0.76        46



# Voting Classifier (tfidfVectorizer)


In [67]:
# Members of the hard-voting ensemble.
vote_clf1 = MultinomialNB(alpha=0.25)
# penalty='l1' needs a solver that supports it. 'liblinear' is named
# explicitly because scikit-learn >= 0.22 defaults to 'lbfgs', which raises
# a ValueError for L1; on older releases 'liblinear' was already the default,
# so results there are unchanged.
vote_clf2 = LogisticRegression(C=0.25, penalty='l1', solver='liblinear')
vote_clf3 = LinearSVC(C=0.05, max_iter=3000)
vote_clf4 = RandomForestClassifier(max_depth=4, n_estimators=300, random_state=3)
vote_clf5 = XGBClassifier(max_depth=7, seed=2, random_state=1995,
                          colsample_bytree=0.3, subsample=0.7)

# voting='hard': the ensemble predicts the majority label of its members.
eclf1_tfidf = VotingClassifier(
    estimators=[
        ('Multinomial Naive Bayes', vote_clf1),
        ('Logistic Regression Classifier', vote_clf2),
        ('LinearSVC', vote_clf3),
        ('RandomForestClassifier', vote_clf4),
        ('XGBoost', vote_clf5),
    ],
    voting='hard',
)
eclf1_tfidf = eclf1_tfidf.fit(X_train, y_train)

classifiers = [vote_clf1, vote_clf2, vote_clf3, vote_clf4, vote_clf5, eclf1_tfidf]
classifier_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC',
                    'Random Forest Classifier', 'XGBoost', 'VotingClassifier']

# Project helper: prints the cross-validated accuracy of every classifier.
scoring(classifiers, classifier_names, X_train, y_train)

# Evaluate on the hold-out split. Accuracy is part of the classification
# report printed below, so no separate score() call is needed.
predicted = eclf1_tfidf.predict(X_test)

# Translate the 0/1 predictions back to the class names used in class_test.
pred_results = ['Distribution' if label == 1 else 'Other' for label in predicted]

# Hold-out classification report.
print("\n")
print("Voting Classifier (TF-IDF) Clasification Report")
print(classification_report(class_test.tolist(), pred_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.87 (+/- 0.05) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Logistic Regression]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Linear SVC]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.91 (+/- 0.01) [XGBoost]
5-fold cross validated Accuracy: 0.87 (+/- 0.01) [VotingClassifier]


Voting Classifier (TF-IDF) Clasification Report
              precision    recall  f1-score   support

Distribution       0.00      0.00      0.00         9
       Other       0.80      1.00      0.89        37

    accuracy                           0.80        46
   macro avg       0.40      0.50      0.45        46
weighted avg       0.65      0.80      0.72        46



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Appending the New Categories on to the Data set
* We pick the model with the highest precision/F1-score as our final model.
* This model is the stacked classifier with logistic regression as the meta-classifier on TF-IDF features (`sclf_tfidf`).

In [68]:
# Label the new data with the selected model (the TF-IDF stack with a
# logistic-regression meta classifier). X_full must be the TF-IDF matrix here.
full['BestModelClassification'] = sclf_tfidf.predict(X_full)

# Turn the 0/1 predictions into readable class names in a single pass.
full['BestModelClassification'] = full['BestModelClassification'].replace(
    {1: "Distribution", 0: "Other"}
)

# Number of records flagged as 'Distribution', followed by the full breakdown.
print((full['BestModelClassification'] == 'Distribution').sum())
full['BestModelClassification'].value_counts()

44


Other           1653
Distribution      44
Name: BestModelClassification, dtype: int64

In [69]:
# Keep only the records the selected model labelled 'Distribution'.
# NOTE: this rebinds df, which until now held the labelled training data.
df = full.loc[full['BestModelClassification'] == 'Distribution'].reset_index(drop=True)

In [70]:
# Write the records labelled 'Distribution' by the selected model to disk (index column omitted).
df.to_csv("BestDistribution.csv", index = False)