In [1]:
from ThematicTextClassify.TextClassifier import *
from ThematicTextClassify.Preprocessing import *



In [2]:
# Load the hand-labelled links that serve as the training/holdout data.
df = pd.read_csv('Categorized_Links.csv')

# Build one text document per link from its title and description.
# NOTE(review): the two fields are concatenated with no separator, so the last
# word of the title is fused with the first word of the description unless the
# CSV already carries that whitespace -- confirm this is intended.
df['Text'] = df['Title'] + df['Description']
# A missing title or description yields a NaN document; drop those rows.
df = df.dropna(subset=['Text'], axis=0)

# preprocess_text comes from the project's star-imports; its output is joined
# with spaces below, so it presumably returns a list of tokens -- TODO confirm.
df['Processed Text'] = df['Text'].map(preprocess_text)
df['processed_string'] = [' '.join(text) for text in df['Processed Text']]

# Binary target: 'Utilization' versus everything else (a missing Category
# compares unequal and therefore lands in 'Other').
df['Class'] = df['Category'].eq('Utilization').map(
    {True: 'Utilization', False: 'Other'}
)

# Sorting descending puts 'Utilization' rows ahead of 'Other' rows, so when a
# Link appears more than once keep='first' retains the 'Utilization' copy.
df = (
    df.sort_values('Class', ascending=False)
      .drop_duplicates(['Link'], keep='first')
      .reset_index(drop=True)
)
len(df)

301

In [3]:
# Hold out 15% of the labelled links for evaluation. text_train / text_test
# are full DataFrame rows (every column of df), not only the text column;
# class_train / class_test are the matching 'Class' labels.
text_train, text_test, class_train, class_test = train_test_split(
    df, df['Class'], test_size=0.15, random_state=19
)

# Combining Classifier Predictions 
### We have already tuned each classifier with optimized parameters; we now combine their classification results on the new data and drop the duplicates
* Multinomial Naive Bayes (CountVectorizer/tfidf)
* Logistic Regression (CountVectorizer/tfidf)
* Linear SVC (CountVectorizer/tfidf)
* RandomForestClassifier(CountVectorizer/tfidf)
* XGBoost(CountVectorizer/tfidf)
##### Note: This method is not necessarily the best way since it introduces a lot of error in our classification results, but we are lenient since we want to include as much information as possible


In [4]:
# Read the unlabelled records that the trained models will classify.
full = pd.read_csv("NewData.csv")

# Each document is the title followed directly by the description; it is then
# run through preprocess_text and re-joined into a single space-separated
# string for the vectorizers.
full['Text'] = full['Title'] + full['Description']
full['Processed Text'] = full['Text'].map(preprocess_text)
full['processed_string'] = full['Processed Text'].map(' '.join)
full = full.reset_index(drop=True)

In [5]:
# The ten tuned (vectorizer, classifier) pairs, in the order model1..model10.
# Every pair is trained on the labelled split and used to classify `full`.
tuned_pipelines = [
    # first model: Multinomial Naive Bayes on raw counts
    (CountVectorizer(max_df=0.25, min_df=1, ngram_range=(1, 2)),
     MultinomialNB(alpha=0.25)),
    # second model: Multinomial Naive Bayes on tf-idf
    (TfidfVectorizer(max_df=0.25, min_df=3, ngram_range=(1, 1)),
     MultinomialNB(alpha=0.5)),
    # third model: Logistic Regression on raw counts
    (CountVectorizer(max_df=0.75, min_df=3, ngram_range=(1, 1)),
     LogisticRegression(C=0.5, penalty='l2')),
    # fourth model: Logistic Regression on tf-idf.
    # The l1 penalty is not supported by scikit-learn's current default solver
    # (lbfgs), so the solver that used to be the default is named explicitly.
    (TfidfVectorizer(max_df=0.25, min_df=3, ngram_range=(1, 2)),
     LogisticRegression(C=1.0, penalty='l1', solver='liblinear')),
    # fifth model: Linear SVC on raw counts
    (CountVectorizer(max_df=0.75, min_df=3, ngram_range=(1, 2)),
     LinearSVC(C=0.05)),
    # sixth model: Linear SVC on tf-idf
    (TfidfVectorizer(max_df=0.25, min_df=2, ngram_range=(1, 1)),
     LinearSVC(C=0.25)),
    # seventh model: Random Forest on raw counts
    (CountVectorizer(max_df=0.25, min_df=3, ngram_range=(1, 1)),
     RandomForestClassifier(max_depth=4, n_estimators=200, random_state=1)),
    # eighth model: Random Forest on tf-idf
    (TfidfVectorizer(max_df=0.75, min_df=3, ngram_range=(1, 2)),
     RandomForestClassifier(max_depth=4, n_estimators=300, random_state=1)),
    # ninth model: XGBoost on raw counts
    (CountVectorizer(max_df=0.25, min_df=2, ngram_range=(1, 1)),
     XGBClassifier(max_depth=4, seed=1, random_state=1995,
                   colsample_bytree=0.3, subsample=0.5)),
    # tenth model: XGBoost on tf-idf
    (TfidfVectorizer(max_df=0.75, min_df=1, ngram_range=(1, 2)),
     XGBClassifier(max_depth=5, seed=1, random_state=1995,
                   colsample_bytree=0.3, subsample=0.7)),
]

# full['Class'] is overwritten on every pass; the boolean-mask selection takes
# a copy, so each stored frame keeps the labels of the model that produced it.
utilization_hits = []
for vectorizer, classifier in tuned_pipelines:
    full['Class'] = feature_pipe(vectorizer, classifier, text_train, class_train, full)
    utilization_hits.append(full[full['Class'] == 'Utilization'])

# Keep the individual names that the concatenation cell expects.
(model1, model2, model3, model4, model5,
 model6, model7, model8, model9, model10) = utilization_hits




In [6]:
# Stack the 'Utilization' hits of all ten models into one frame.
frames = [
    model1, model2, model3, model4, model5,
    model6, model7, model8, model9, model10,
]
stacked_hits = pd.concat(frames)
print("Length of classified data set before dropping duplicates: ", len(stacked_hits))

# A link flagged by several models appears several times; keep one row per
# link (the occurrence coming from the latest model in the list).
result_frame = stacked_hits.drop_duplicates(subset=['Link'], keep='last')
print("Length of classified data set after dropping duplicates: ", len(result_frame))

Length of classified data set before dropping duplicates:  301
Length of classified data set after dropping duplicates:  86


In [7]:
# Export only the descriptive columns plus the predicted class, renumbered
# from zero, to utilization.csv (the index itself is not written).
export_columns = ['Title', 'Description', 'Link', 'Class']
utilizationcsv = result_frame[export_columns].reset_index(drop=True)
utilizationcsv.to_csv('utilization.csv', index=False)

In [8]:
# Display the last rows of the exported table as a quick sanity check.
utilizationcsv.tail()

Unnamed: 0,Title,Description,Link,Class
81,Combining nutrient intake from food/beverages ...,This article describes methods for combining ...,https://www150.statcan.gc.ca/n1/pub/82-003-x/2...,Utilization
82,Impact of identifying plausible respondents on...,"A 24-hour dietary recall from 16,190 responde...",https://www150.statcan.gc.ca/n1/pub/82-003-x/2...,Utilization
83,Impact of number of repeat 24 hour recall inte...,National Food and Nutrition Surveys provide c...,https://www150.statcan.gc.ca/n1/pub/11-522-x/2...,Utilization
84,"Nutrient intakes from food, 2015 Archived",This is a health fact sheet about the nutrien...,https://www150.statcan.gc.ca/n1/pub/82-625-x/2...,Utilization
85,"Protein sources in the Canadian diet, 2015",This infographic presents results from the 20...,https://www150.statcan.gc.ca/n1/pub/11-627-m/1...,Utilization


# Setting Up Training Data, Holdout, and New data to be Classified (CountVectorizer)
* Note: It is not practical to give each model its own vectorizer inside a single ensemble. Therefore we use one CountVectorizer with the most common tuning parameters among the best models above (optimized using GridSearchCV)

In [19]:
# Shared bag-of-words features for the CountVectorizer ensembles.
countvect = CountVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 2))

# Explicit label encoding. Mapping and then casting yields an int64 array
# regardless of pandas version (chained Series.replace relied on silent
# object->int downcasting), and an unexpected label fails loudly in astype
# instead of passing through unchanged.
label_to_int = {"Utilization": 1, "Other": 0}

# Training data: the vocabulary is fitted on the training documents only.
X_train = countvect.fit_transform(text_train['processed_string']).toarray()
y_train = class_train.map(label_to_int).astype('int64').values

# Holdout data, projected onto the training vocabulary.
X_test = countvect.transform(text_test['processed_string']).toarray()
y_test = class_test.map(label_to_int).astype('int64').values

# New, unlabelled data to be classified.
X_full = countvect.transform(full['processed_string']).toarray()

# Stacked Classifier (CountVectorizer)
## Using Logistic Regression as meta_classifier

In [20]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

# Base learners, using the parameters tuned for CountVectorizer features.
stack_clf1 = MultinomialNB(alpha=0.25)
stack_clf2 = LogisticRegression(C=0.5, penalty='l2')
stack_clf3 = SVC(kernel='linear', C=0.05, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=1)
stack_clf5 = XGBClassifier(
    max_depth=4, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.5
)

# Logistic regression combines the base learners' class probabilities.
meta_clf = LogisticRegression(C=1.0, penalty='l2')
sclf_log = StackingCVClassifier(
    classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
    use_probas=True,
    meta_classifier=meta_clf,
)

classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_log]
classifier_names = [
    'Multinomial Naive Bayes',
    'Logistic Regression',
    'Linear SVC',
    'Random Forest Classifier',
    'XGBoost',
    'StackingClassifier',
]

# Accuracy function (5-fold cross validated)
scoring(classifiers, classifier_names, X_train, y_train)

# Fit on the training split and predict the holdout set.
sclf_log = sclf_log.fit(X_train, y_train)
prediction_results_int = sclf_log.predict(X_test)
# Translate the 0/1 predictions back to the string labels of class_test.
prediction_results = [
    'Utilization' if label == 1 else 'Other' for label in prediction_results_int
]

# Print classification report
print("\n")
print("Stacked Classifier (CountVect) Clasification Report (Logistic Regression Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.92 (+/- 0.04) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.95 (+/- 0.03) [Logistic Regression]
5-fold cross validated Accuracy: 0.95 (+/- 0.03) [Linear SVC]
5-fold cross validated Accuracy: 0.89 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.91 (+/- 0.05) [XGBoost]
5-fold cross validated Accuracy: 0.95 (+/- 0.03) [StackingClassifier]


Stacked Classifier (CountVect) Clasification Report (Logistic Regression Meta Classifier)
              precision    recall  f1-score   support

       Other       0.89      0.97      0.93        34
 Utilization       0.89      0.67      0.76        12

    accuracy                           0.89        46
   macro avg       0.89      0.82      0.85        46
weighted avg       0.89      0.89      0.89        46



# Stacked Classifier (CountVectorizer)
## Using XGBoost as the Meta Classifier

In [26]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

# Base learners, using the parameters tuned for CountVectorizer features.
stack_clf1 = MultinomialNB(alpha=0.25)
stack_clf2 = LogisticRegression(C=0.5, penalty='l2')
stack_clf3 = SVC(kernel='linear', C=0.05, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=1)
stack_clf5 = XGBClassifier(
    max_depth=4, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.5
)

# XGBoost combines the base learners' (un-averaged) class probabilities.
meta_clf = XGBClassifier(
    max_depth=4, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.5
)
sclf_XGB = StackingClassifier(
    classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
    use_probas=True,
    average_probas=False,
    meta_classifier=meta_clf,
)

classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_XGB]
classifier_names = [
    'Multinomial Naive Bayes',
    'Logistic Regression',
    'Linear SVC',
    'Random Forest Classifier',
    'XGBoost',
    'StackingClassifier',
]

# Accuracy function (5-fold cross validated)
scoring(classifiers, classifier_names, X_train, y_train)

# Fit on the training split and predict the holdout set.
sclf_XGB = sclf_XGB.fit(X_train, y_train)
prediction_results_int = sclf_XGB.predict(X_test)
# Translate the 0/1 predictions back to the string labels of class_test.
prediction_results = [
    'Utilization' if label == 1 else 'Other' for label in prediction_results_int
]

# Print classification report
print("\n")
print("Stacked Classifier (CountVect) Clasification Report (XGBoost as Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.92 (+/- 0.04) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.95 (+/- 0.03) [Logistic Regression]
5-fold cross validated Accuracy: 0.95 (+/- 0.03) [Linear SVC]
5-fold cross validated Accuracy: 0.89 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.91 (+/- 0.05) [XGBoost]
5-fold cross validated Accuracy: 0.95 (+/- 0.03) [StackingClassifier]


Stacked Classifier (CountVect) Clasification Report (XGBoost as Meta Classifier)
              precision    recall  f1-score   support

       Other       0.92      0.97      0.94        34
 Utilization       0.90      0.75      0.82        12

    accuracy                           0.91        46
   macro avg       0.91      0.86      0.88        46
weighted avg       0.91      0.91      0.91        46



# Stacked Classifier (CountVectorizer): StackingCVClassifier with XGBoost as the Meta Classifier

In [11]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

# Base learners, using the parameters tuned for CountVectorizer features.
stack_clf1 = MultinomialNB(alpha=0.25)
stack_clf2 = LogisticRegression(C=0.5, penalty='l2')
stack_clf3 = SVC(kernel='linear', C=0.05, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=1)
stack_clf5 = XGBClassifier(
    max_depth=4, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.5
)

meta_clf = XGBClassifier(
    max_depth=4, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.5
)

# Cross-validated stacking variant. It is bound to its own name so that it
# does not overwrite sclf_XGB, the plain StackingClassifier that the final
# classification cell uses as the best model when the notebook is run from
# top to bottom.
sclf_cv_XGB = StackingCVClassifier(
    classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
    use_probas=True,
    meta_classifier=meta_clf,
)

classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_cv_XGB]
classifier_names = [
    'Multinomial Naive Bayes',
    'Logistic Regression',
    'Linear SVC',
    'Random Forest Classifier',
    'XGBoost',
    'StackingClassifier',
]

# Accuracy function (5-fold cross validated)
scoring(classifiers, classifier_names, X_train, y_train)

# Fit on the training split and predict the holdout set.
sclf_cv_XGB = sclf_cv_XGB.fit(X_train, y_train)
prediction_results_int = sclf_cv_XGB.predict(X_test)
# Translate the 0/1 predictions back to the string labels of class_test.
prediction_results = [
    'Utilization' if label == 1 else 'Other' for label in prediction_results_int
]

# Print classification report
print("\n")
print("Stacked Classifier (CountVect) Clasification Report (XGBoost as Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.92 (+/- 0.04) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.95 (+/- 0.03) [Logistic Regression]
5-fold cross validated Accuracy: 0.95 (+/- 0.03) [Linear SVC]
5-fold cross validated Accuracy: 0.89 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.91 (+/- 0.05) [XGBoost]
5-fold cross validated Accuracy: 0.95 (+/- 0.02) [StackingClassifier]


Stacked Classifier (CountVect) Clasification Report (XGBoost as Meta Classifier)
              precision    recall  f1-score   support

       Other       0.89      0.94      0.91        34
 Utilization       0.80      0.67      0.73        12

    accuracy                           0.87        46
   macro avg       0.84      0.80      0.82        46
weighted avg       0.87      0.87      0.87        46



# Setting Up Data (tfidfVectorizer)

In [13]:
# Shared tf-idf features for the TfidfVectorizer ensembles.
# NOTE: this rebinds X_train / X_test / X_full, so anything evaluated after
# this cell sees tf-idf features rather than the earlier count features.
tfidf = TfidfVectorizer(max_df=0.5, min_df=3, ngram_range=(1, 2))

# Explicit label encoding. Mapping and then casting yields an int64 array
# regardless of pandas version (chained Series.replace relied on silent
# object->int downcasting), and an unexpected label fails loudly in astype
# instead of passing through unchanged.
label_to_int = {"Utilization": 1, "Other": 0}

# Training data: vocabulary and idf weights are fitted on the training
# documents only.
X_train = tfidf.fit_transform(text_train['processed_string']).toarray()
y_train = class_train.map(label_to_int).astype('int64').values

# Holdout data, projected onto the training vocabulary.
X_test = tfidf.transform(text_test['processed_string']).toarray()
y_test = class_test.map(label_to_int).astype('int64').values

# New, unlabelled data to be classified.
X_full = tfidf.transform(full['processed_string']).toarray()

# Stacked Classifier (tfidfVectorizer)
## Using Logistic Regression as meta_classifier

In [14]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

# Base learners, using the parameters tuned for tf-idf features.
stack_clf1 = MultinomialNB(alpha=0.5)
# The l1 penalty is not supported by scikit-learn's current default solver
# (lbfgs), so the solver that used to be the default is named explicitly.
stack_clf2 = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')
stack_clf3 = SVC(kernel='linear', C=0.25, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=300, random_state=1)
stack_clf5 = XGBClassifier(
    max_depth=5, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.7
)

# Logistic regression combines the base learners' (un-averaged) class
# probabilities.
meta_clf = LogisticRegression(C=0.5, penalty='l2')
sclf_tfidf = StackingClassifier(
    classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
    use_probas=True,
    average_probas=False,
    meta_classifier=meta_clf,
)

classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_tfidf]
classifier_names = [
    'Multinomial Naive Bayes',
    'Logistic Regression',
    'Linear SVC',
    'Random Forest Classifier',
    'XGBoost',
    'StackingClassifier',
]

# Accuracy function (5-fold cross validated)
scoring(classifiers, classifier_names, X_train, y_train)

# Fit on the training split and predict the holdout set.
sclf_tfidf = sclf_tfidf.fit(X_train, y_train)
prediction_results_int = sclf_tfidf.predict(X_test)
# Translate the 0/1 predictions back to the string labels of class_test.
prediction_results = [
    'Utilization' if label == 1 else 'Other' for label in prediction_results_int
]

# Print classification report
print("\n")
print("Stacked Classifier (TF-IDF) Clasification Report (Logistic Regression as meta classifier)")
print(classification_report(class_test.tolist(), prediction_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.95 (+/- 0.02) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.89 (+/- 0.01) [Logistic Regression]
5-fold cross validated Accuracy: 0.88 (+/- 0.01) [Linear SVC]
5-fold cross validated Accuracy: 0.90 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.91 (+/- 0.05) [XGBoost]
5-fold cross validated Accuracy: 0.93 (+/- 0.04) [StackingClassifier]


Stacked Classifier (TF-IDF) Clasification Report (Logistic Regression as meta classifier)
              precision    recall  f1-score   support

       Other       0.89      1.00      0.94        34
 Utilization       1.00      0.67      0.80        12

    accuracy                           0.91        46
   macro avg       0.95      0.83      0.87        46
weighted avg       0.92      0.91      0.91        46



# Stacked Classifier (tfidfVectorizer)
## Using XGBoost as meta_classifier

In [20]:
from ThematicTextClassify.EnsembleClassifiers import *
from xgboost import XGBClassifier

# Base learners, using the parameters tuned for tf-idf features.
stack_clf1 = MultinomialNB(alpha=0.5)
# The l1 penalty is not supported by scikit-learn's current default solver
# (lbfgs), so the solver that used to be the default is named explicitly.
stack_clf2 = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')
stack_clf3 = SVC(kernel='linear', C=0.25, probability=True)
stack_clf4 = RandomForestClassifier(max_depth=4, n_estimators=300, random_state=1)
stack_clf5 = XGBClassifier(
    max_depth=5, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.7
)

# XGBoost combines the base learners' (un-averaged) class probabilities.
meta_clf = XGBClassifier(
    max_depth=5, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.7
)
sclf_XGB_tfidf = StackingClassifier(
    classifiers=[stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5],
    use_probas=True,
    average_probas=False,
    meta_classifier=meta_clf,
)

classifiers = [stack_clf1, stack_clf2, stack_clf3, stack_clf4, stack_clf5, sclf_XGB_tfidf]
classifier_names = [
    'Multinomial Naive Bayes',
    'Logistic Regression',
    'Linear SVC',
    'Random Forest Classifier',
    'XGBoost',
    'StackingClassifier',
]

# Accuracy function (5-fold cross validated)
scoring(classifiers, classifier_names, X_train, y_train)

# Fit on the training split and predict the holdout set.
sclf_XGB_tfidf = sclf_XGB_tfidf.fit(X_train, y_train)
prediction_results_int = sclf_XGB_tfidf.predict(X_test)
# Translate the 0/1 predictions back to the string labels of class_test.
prediction_results = [
    'Utilization' if label == 1 else 'Other' for label in prediction_results_int
]

# Print classification report
print("\n")
print("Stacked Classifier (TF-IDF) Clasification Report (XGBoost as Meta Classifier)")
print(classification_report(class_test.tolist(), prediction_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.95 (+/- 0.02) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.89 (+/- 0.01) [Logistic Regression]
5-fold cross validated Accuracy: 0.88 (+/- 0.01) [Linear SVC]
5-fold cross validated Accuracy: 0.90 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.91 (+/- 0.05) [XGBoost]
5-fold cross validated Accuracy: 0.95 (+/- 0.03) [StackingClassifier]


Stacked Classifier (TF-IDF) Clasification Report (XGBoost as Meta Classifier)
              precision    recall  f1-score   support

       Other       0.92      0.97      0.94        34
 Utilization       0.90      0.75      0.82        12

    accuracy                           0.91        46
   macro avg       0.91      0.86      0.88        46
weighted avg       0.91      0.91      0.91        46



# Voting Classifier (tfidfVectorizer)

In [16]:
# Voters, using the parameters tuned for tf-idf features.
vote_clf1 = MultinomialNB(alpha=0.5)
# The l1 penalty is not supported by scikit-learn's current default solver
# (lbfgs), so the solver that used to be the default is named explicitly.
vote_clf2 = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')
vote_clf3 = LinearSVC(C=0.25, max_iter=3000)
vote_clf4 = RandomForestClassifier(max_depth=4, n_estimators=300, random_state=1)
vote_clf5 = XGBClassifier(
    max_depth=5, seed=1, random_state=1995, colsample_bytree=0.3, subsample=0.7
)

# Hard voting: the ensemble is built with voting='hard', i.e. it votes on the
# predicted class labels rather than on probabilities.
eclf1_tfidf = VotingClassifier(
    estimators=[
        ('Multinomial Naive Bayes', vote_clf1),
        ('Logistic Regression Classifier', vote_clf2),
        ('LinearSVC', vote_clf3),
        ('RandomForestClassifier', vote_clf4),
        ('XGBoost', vote_clf5),
    ],
    voting='hard',
)
eclf1_tfidf = eclf1_tfidf.fit(X_train, y_train)

classifiers = [vote_clf1, vote_clf2, vote_clf3, vote_clf4, vote_clf5, eclf1_tfidf]
classifier_names = [
    'Multinomial Naive Bayes',
    'Logistic Regression',
    'Linear SVC',
    'Random Forest Classifier',
    'XGBoost',
    'VotingClassifier',
]

# Accuracy function (5-fold cross validated)
scoring(classifiers, classifier_names, X_train, y_train)

# Predict the holdout set and translate the 0/1 predictions back to the
# string labels of class_test.
predicted = eclf1_tfidf.predict(X_test)
pred_results = ['Utilization' if label == 1 else 'Other' for label in predicted]

# Print classification report
print("\n")
print("Voting Classifier (TF-IDF) Clasification Report")
print(classification_report(class_test.tolist(), pred_results))

5-fold cross validation:

5-fold cross validated Accuracy: 0.95 (+/- 0.02) [Multinomial Naive Bayes]
5-fold cross validated Accuracy: 0.89 (+/- 0.01) [Logistic Regression]
5-fold cross validated Accuracy: 0.93 (+/- 0.03) [Linear SVC]
5-fold cross validated Accuracy: 0.90 (+/- 0.01) [Random Forest Classifier]
5-fold cross validated Accuracy: 0.91 (+/- 0.05) [XGBoost]
5-fold cross validated Accuracy: 0.93 (+/- 0.04) [VotingClassifier]


Voting Classifier (TF-IDF) Clasification Report
              precision    recall  f1-score   support

       Other       0.85      1.00      0.92        34
 Utilization       1.00      0.50      0.67        12

    accuracy                           0.87        46
   macro avg       0.93      0.75      0.79        46
weighted avg       0.89      0.87      0.85        46



# Appending the New Categories on to the Data set
* We pick the one model that has the highest precision/f1-score to be our model 
* Best model: `Stacked Classifier (Countvect) XGBoost as Meta Classifier`

In [27]:
# sclf_XGB was trained on CountVectorizer features, but by this point in the
# notebook X_full has been rebuilt from the tf-idf vectorizer. Re-derive the
# count features from the fitted countvect so the model is scored on the same
# feature space it was trained on.
X_full_count = countvect.transform(full['processed_string']).toarray()
best_predictions = sclf_XGB.predict(X_full_count)

# Translate the 0/1 predictions back to readable class names.
full['BestModelClassification'] = [
    'Utilization' if label == 1 else 'Other' for label in best_predictions
]
print(len(full[full['BestModelClassification'] == 'Utilization']))
full['BestModelClassification'].value_counts()

59


Other          1638
Utilization      59
Name: BestModelClassification, dtype: int64

In [28]:
# Keep only the records the best model labelled 'Utilization', renumber them
# from zero and export them (without the index) to BestUtilization.csv.
# Note that this rebinds `df`, which previously held the labelled training data.
is_utilization = full['BestModelClassification'] == 'Utilization'
df = full[is_utilization].reset_index(drop=True)
df.to_csv("BestUtilization.csv", index=False)