In [1]:
import pandas as pd
import chardet
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
import pickle
import matplotlib as plt
import shutil
import os

In [2]:
# Load the labelled comments. The file encoding is not known up front, so it is
# sniffed with chardet from the raw bytes before pandas parses the CSV.
with open('AllQualBaseThemes.csv', 'rb') as f:
    raw_bytes = f.read()  # reads the whole file; switch to readline if the file is large
result = chardet.detect(raw_bytes)

# Parse with the detected encoding and drop every row that has a missing value
df = pd.read_csv("AllQualBaseThemes.csv", encoding=result['encoding']).dropna()

# Force the free-text columns to plain strings
for text_col in ('Marked_Passage', 'Category_Title'):
    df[text_col] = df[text_col].apply(str)

# text_desc is the column the models read; it is a copy of the marked passage
df['text_desc'] = df['Marked_Passage']

# Recode Sentiment: 'P' (positive) becomes 1, 'N' (negative) becomes 0
is_positive = df['Sentiment']=='P'
df.loc[is_positive, 'Sentiment'] = 1
is_negative = df['Sentiment'] == 'N'
df.loc[is_negative, 'Sentiment'] = 0

# Usually takes between 6-8 seconds to run this chunk
df.sample(10)

Unnamed: 0,Course,Sentiment,Learning Activities,Learning Technology,Course Structure,Communication,Learning Materials,Learning Objectives,Community,General,Instructor/TAs,Category_Title,Marked_Passage,text_desc
957,COMMS 426,0,0,0,1,0,1,0,0,0,0,Learning Materials,It can be a little dense having to do both th...,It can be a little dense having to do both th...
2086,IP&T 371,0,0,0,0,0,1,0,0,0,0,Course Structure,All the information I needed to be successful ...,All the information I needed to be successful ...
3665,HLTH 335,0,0,0,0,1,0,0,0,0,0,Communication,I think some of the directions towards the end...,I think some of the directions towards the end...
1700,Psych 275,0,0,0,1,1,0,0,0,0,0,Course Structure,There has been a lot of confusion about when ...,There has been a lot of confusion about when ...
531,NURS 491,0,0,0,1,0,1,0,0,0,0,Learning Materials,I have felt like there is more reading than i...,I have felt like there is more reading than i...
330,WRTG 150,0,0,0,0,1,0,0,0,0,1,Communication,I barely got any direction from my professor\...,I barely got any direction from my professor\...
4564,RELA 275,0,1,0,0,0,0,0,0,0,0,Learning Activities,Less writing,Less writing
5766,PSYCH 358,1,0,0,1,1,0,0,0,0,0,Course Structure,The lesson plans and routines are clearly defi...,The lesson plans and routines are clearly defi...
2219,PSYCH 338,0,0,0,0,0,1,0,0,0,0,Learning Materials,Being able to have a write up of the lectures ...,Being able to have a write up of the lectures ...
6517,CHEM 101,0,0,0,0,0,0,0,0,1,0,General,This course needs a total overhaul,This course needs a total overhaul


In [17]:
def extract_features(df_x, type, to_pickle = True):
    """Fit a bag-of-words vectorizer on ``df_x`` and return the dense feature matrix.

    Parameters
    ----------
    df_x : iterable of str
        Raw text documents to vectorize.
    type : str
        ``'tfidf'`` selects a ``TfidfVectorizer``; ``'count_vect'`` selects a
        ``CountVectorizer``. (The name shadows the builtin but is kept so that
        existing keyword callers keep working.)
    to_pickle : bool, default True
        When truthy, the fitted vectorizer is pickled into the notebook-level
        ``directory`` as ``tfidf_transformer.pkl`` or ``cv_transformer.pkl``.

    Returns
    -------
    The result of ``.toarray()`` on the fitted document-term matrix.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'tfidf'`` nor ``'count_vect'``. Previously this
        case fell through and failed on the unassigned ``bag`` variable.
    """
    if type == 'tfidf':
        transformer = TfidfVectorizer()
        pickle_name = 'tfidf_transformer.pkl'
        success_message = 'Tfidf bag of words successfully pickled into tfidf_transformer.pkl'
    elif type == 'count_vect':
        transformer = CountVectorizer()
        pickle_name = 'cv_transformer.pkl'
        success_message = 'Count vectorizer bag of words successfully pickled into cv_transformer.pkl'
    else:
        raise ValueError(
            f"Unsupported feature type {type!r}; expected 'tfidf' or 'count_vect'"
        )

    bag = transformer.fit_transform(df_x)

    if to_pickle:
        # The context manager guarantees the pickle is flushed and the handle closed
        with open(f'{directory}/{pickle_name}', 'wb') as transformer_file:
            pickle.dump(transformer, transformer_file)
        print(success_message)

    return bag.toarray()

def model_trainer2(label, x_column):
    """Train tf-idf and count-vectorizer logistic regressions for one label and keep the better one.

    Parameters
    ----------
    label : str
        Name of the column in the notebook-level ``df`` holding the target
        values; it is cast to ``int`` before training.
    x_column : sequence of str
        The raw (unsplit, untransformed) comment text, row-aligned with ``df``.

    Returns
    -------
    (model, X_test)
        ``model`` is the fitted ``LogisticRegression`` with the higher test
        accuracy (tf-idf wins ties). ``X_test`` is the held-out text transformed
        with the vectorizer that matches ``model``. The fitted vectorizer itself
        is local to this function and is not returned.

    Notes
    -----
    The split uses the notebook-level ``random_seed`` with ``test_size = 0.2``.
    Each feature type gets its own estimator: ``fit`` returns the estimator
    itself, so fitting a single shared instance twice would leave both
    "models" pointing at the second (count-vectorizer) fit.
    """
    df_y = df[label].astype(int)

    # Split the raw text and labels into training and testing parts
    X_train, X_test, y_train, y_test = train_test_split(x_column, df_y, random_state = random_seed, test_size = 0.2)

    # Learn each vocabulary from the training text only, then reuse it for the test text
    tfidf = TfidfVectorizer()
    cv = CountVectorizer()
    tf_bag = tfidf.fit_transform(X_train)
    cv_bag = cv.fit_transform(X_train)
    tf_test = tfidf.transform(X_test)
    cv_test = cv.transform(X_test)

    def _new_log_reg():
        # (TO DO) Maybe tweak the hyperparameters of this model?
        return LogisticRegression(C = 10, random_state = 27, solver = 'lbfgs', multi_class = 'ovr', max_iter=2000)

    # Two independent estimators so that neither fit overwrites the other
    tf_model = _new_log_reg().fit(tf_bag, y_train)
    cv_model = _new_log_reg().fit(cv_bag, y_train)

    # Score each model on the test features produced by its own vectorizer
    tf_predict = tf_model.predict(tf_test)
    cv_predict = cv_model.predict(cv_test)

    tf_accuracy = metrics.accuracy_score(y_test, tf_predict)
    cv_accuracy = metrics.accuracy_score(y_test, cv_predict)
    print(f'Tfidf accuracy score for {label}: {round(tf_accuracy, 4)}')
    print(f'Count Vectorizor accuracy score for {label}: {round(cv_accuracy, 4)}')

    # Return the winning model together with its matching test features
    if tf_accuracy >= cv_accuracy:
        return tf_model, tf_test
    return cv_model, cv_test


def model_trainer(label, x_features):
    """Fit a logistic regression for one label column of the notebook-level ``TRAIN`` frame.

    A lighter alternative to ``model_trainer2``: ``x_features`` must already be
    split and vectorized, so the same transformation is not repeated for every
    label. ``label`` names the ``TRAIN`` column used as the target (cast to int).
    Returns the fitted estimator.
    """
    targets = TRAIN[label].astype(int)

    classifier = LogisticRegression(
        C = 10,
        random_state = 11,
        solver = 'lbfgs',
        multi_class = 'ovr',
        max_iter = 2000,
    )
    # TODO Test whether the initial accuracy metric being included here screws things up.
    # TODO Maybe make a separate function that finds accuracy??
    return classifier.fit(x_features, targets)

def check_accuracy(model, transformer, category):
    """Print and return the accuracy of ``model`` on the notebook-level ``TEST`` frame.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``
    transformer : fitted vectorizer exposing ``transform``
        Must be the vectorizer whose features ``model`` was trained on.
    category : str
        Name of the ``TEST`` column holding the true labels; it is cast to
        ``int``, the same way ``model_trainer`` prepares its training target.

    Returns
    -------
    The accuracy score that is also printed (rounded to 4 places in the printout).
    """
    # Features come from the comment text; the labels come from the category column
    x_test = transformer.transform(TEST['text_desc'])
    y_test = TEST[category].astype(int)
    y_preds = model.predict(x_test)
    score = metrics.accuracy_score(y_test, y_preds)
    print(f'Accuracy score for {category}: {round(score, 4)}')
    return score

def get_top_k_predictions(model, X_test,k):
    """Return, for every row of ``X_test``, the ``k`` most probable classes.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``
    X_test : feature matrix accepted by ``model.predict_proba``
    k : int
        Number of predictions to keep per row. Values larger than the number of
        classes return every class; zero or negative values return empty rows.

    Returns
    -------
    list of list of (class_label, probability)
        One inner list per input row, ordered from most to least probable.

    Side effect: switches numpy printing to non-scientific notation so that the
    probabilities are readable when printed.
    """
    np.set_printoptions(suppress=True)

    # Probabilities rather than hard labels, so the classes can be ranked
    probs = model.predict_proba(X_test)

    # Rank class indices best-first, then keep the first k. Slicing from the
    # front avoids the `[:, -k:]` pitfall where k == 0 selects every column.
    top_k = max(k, 0)
    ranked = np.argsort(probs, axis=1)[:, ::-1][:, :top_k]

    # Pair each kept class label with its probability
    return [
        [(model.classes_[class_idx], row_probs[class_idx]) for class_idx in row_ranked]
        for row_probs, row_ranked in zip(probs, ranked)
    ]

def pickler(model, category):
    """Pickle ``model`` to ``'<directory>/<category> model.pkl'``.

    ``directory`` is the notebook-level pickle folder. ``category`` is
    interpolated into the file name verbatim, so a value containing ``/``
    (for example ``'Instructor/TAs'``) points into a subdirectory.

    Only the model is stored here; the vectorizer needed to encode new
    documents has to be saved separately.
    """
    # The context manager closes the handle even if pickling raises
    with open(f'{directory}/{category} model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

In [18]:
# Every theme column in df; one model is trained per entry
categories = [
    'Learning Activities',
    'Learning Technology',
    'Course Structure',
    'Communication',
    'Learning Materials',
    'Learning Objectives',
    'Community',
    'General',
    'Instructor/TAs',
]

# Change this to reshuffle which rows land in the training and testing sets
random_seed = 16

# Start from an empty folder for the pickle files: remove any previous one, then recreate it
directory = os.path.join(os.getcwd(), '.pklfiles')
if os.path.exists(directory):
    shutil.rmtree(directory)
os.mkdir(directory)

# EDA: how many rows carry each theme (TODO finish the plot later?)
ones = {}
print(f'Number of rows in df: {df.shape[0]}')
for item in categories:
    # Count the rows flagged with a 1 for this theme
    counter = int((df[item] == 1).sum())
    proportion = (counter / df.shape[0]) * 100
    print(f'{item}: {counter} ({round(proportion, 2)}%)')
    ones[item] = counter

# TODO bar chart of the counts. Note that plt.bar lives in matplotlib.pyplot,
# while the notebook currently imports `matplotlib as plt`.
# keys = list(ones.keys())
# vals = list(ones.values())

# plt.bar(range(len(ones)), vals)

Number of rows in df: 6812
Learning Activities: 1846 (27.1%)
Learning Technology: 307 (4.51%)
Course Structure: 1966 (28.86%)
Communication: 893 (13.11%)
Learning Materials: 1639 (24.06%)
Learning Objectives: 557 (8.18%)
Community: 393 (5.77%)
General: 306 (4.49%)
Instructor/TAs: 569 (8.35%)


In [19]:
# Column names of the output spreadsheet, in output order. The first half holds
# the text and the model predictions; the '... Reality' half holds the labelled
# reference values for the same comments, for side-by-side comparison.
data_columns = ['Text', 'Sentiment', 'Learning Activities', 'Learning Technology', 'Course Structure',
'Communication', 'Learning Materials', 'Learning Objectives', 'Community','General',
'Instructor/TAs', 'Sentiment Reality', 'Activities Reality', 'Technology Reality', 'Structure Reality',
'Communication Reality', 'Materials Reality', 'Objectives Reality', 'Community Reality',
'General Reality', 'Instructors Reality']

# One list per output column, filled in by this and the following cells.
# Built from data_columns so that the two can never drift apart.
results = {column: [] for column in data_columns}

# Split the data into training and testing data frames (All y columns included)
TRAIN, TEST = train_test_split(df, test_size = 0.2, random_state = random_seed)
print(TRAIN.shape)

# Which TEST column supplies each reference column of the output
reality_sources = {
    'Text': 'text_desc',
    'Activities Reality': 'Learning Activities',
    'Technology Reality': 'Learning Technology',
    'Structure Reality': 'Course Structure',
    'Communication Reality': 'Communication',
    'Materials Reality': 'Learning Materials',
    'Objectives Reality': 'Learning Objectives',
    'Community Reality': 'Community',
    'General Reality': 'General',
    'Sentiment Reality': 'Sentiment',
    'Instructors Reality': 'Instructor/TAs',
}
# Copy each column in one step instead of appending row by row
for results_key, test_column in reality_sources.items():
    results[results_key] = TEST[test_column].tolist()

# Create a bag of words in two different ways and create a pickle file for each method
pickle_directory = os.path.join(os.getcwd(), '.pklfiles')
tfidf = TfidfVectorizer()
cv = CountVectorizer()
# Vocabularies are learned from the training text only
tfidf_trainX = tfidf.fit_transform(TRAIN['text_desc'])
cv_trainX = cv.fit_transform(TRAIN['text_desc'])
print(cv_trainX.shape)
# Context managers make sure each pickle is flushed and its handle closed
with open(f'{pickle_directory}/tfidf_transformer.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)
with open(f'{pickle_directory}/cv_transformer.pkl', 'wb') as cv_file:
    pickle.dump(cv, cv_file)

# Encode the held-out text with the already-fitted vectorizers
tf_test = tfidf.transform(TEST['text_desc'])
cv_test = cv.transform(TEST['text_desc'])

# Fit one model on the tf-idf features; `model` is reassigned by the per-category loop in a later cell
logreg = LogisticRegression(C = 10, random_state = 11, solver = 'lbfgs', multi_class = 'ovr', max_iter = 2000)
model = logreg.fit(X = tfidf_trainX, y = TRAIN['Learning Activities'])



(5449, 14)
(5449, 3604)


In [None]:
# Go through all the results for Sentiment to highlight the process

# model_trainer2 does the whole job for one label: it splits the raw comments,
# builds a bag of words in two ways (tf-idf and plain counts), fits a logistic
# regression on each, prints both accuracy scores and returns the better model
# together with the matching transformed test comments.
# (model_trainer, by contrast, takes pre-transformed features and returns only
# a model, so it cannot be called with x_column or unpacked into two values.)
sentiment_model, sentiment_x = model_trainer2(label = 'Sentiment', x_column = df['text_desc'])

# Store the model in a pickle file so that other scripts and files can load it
sentiment_pickle_path = f'{directory}/Sentiment model.pkl'
with open(sentiment_pickle_path, 'wb') as sentiment_file:
    pickle.dump(sentiment_model, sentiment_file)

# Keep the probability of the positive class (1) for every test comment.
# Assigning the list, rather than appending, keeps this cell safe to re-run.
predictions = get_top_k_predictions(model = sentiment_model, X_test = sentiment_x, k = 2)
results['Sentiment'] = [
    format(ranked[0][1] if ranked[0][0] == 1 else ranked[1][1], '.5f')
    for ranked in predictions
]

# Show a few predictions and see how confident they are.
# NOTE(review): TEST and the split inside model_trainer2 use the same
# random_seed and test_size over the same rows, so row i of TEST should be the
# comment behind predictions[i] - confirm if either split changes.
for i in range(min(10, len(predictions))):
    print(TEST['text_desc'].iloc[i])
    print(predictions[i])


In [20]:
# Train, pickle and score one count-vectorizer model per category
for item in categories:
    model = model_trainer(label = item, x_features = cv_trainX)

    # '/' in the category would be read as a path separator, so use a safe file name
    category = 'Instructors' if item == 'Instructor/TAs' else item
    pickle_path = f'{directory}/{category} model.pkl'
    with open(pickle_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    # Keep the probability of class 1 for every test comment. Assigning the
    # list, rather than appending, means a re-run does not duplicate rows.
    preds = get_top_k_predictions(model = model, X_test = cv_test, k = 2)
    results[item] = [
        format(ranked[0][1] if ranked[0][0] == 1 else ranked[1][1], '.5f')
        for ranked in preds
    ]


(5449,)
(5449,)
(5449,)
(5449,)
(5449,)
(5449,)
(5449,)
(5449,)
(5449,)


In [None]:
# Sanity check before building the output table: print each results column
# with the number of values collected for it.
for val, collected in results.items():
    print(val + '\t' + str(len(collected)))


In [None]:
# Assemble the collected results into a DataFrame whose columns follow the order in data_columns,
# then preview the first 10 rows. This frame is what gets exported in the next cell.
Data = pd.DataFrame(results, columns = data_columns)
Data.head(10)

In [None]:
# Write the pandas dataframe to a CSV file (not an Excel workbook), leaving out the index column
file_name = 'ML sample output.csv'
Data.to_csv(file_name, index = False)

In [None]:
# Round-trip check: reload the pickled Sentiment model and compare predictions
# made with the in-memory count vectorizer against the pickled one.
# Only pickle files written by this notebook should be loaded here;
# unpickling untrusted files can execute arbitrary code.
home = os.getcwd()
model_filepath = '.pklfiles/Sentiment model.pkl'
category_model_file = os.path.join(home, model_filepath)
with open(category_model_file, 'rb') as model_file:
    category_loaded_model = pickle.load(model_file)
extraction_path = os.path.join(home, '.pklfiles/cv_transformer.pkl')
comment = ['I feel like there is a lot of busywork sometimes']

# NOTE(review): the Sentiment model comes from model_trainer2, which keeps
# whichever of the tf-idf / count models scored better, so count features may
# not be the features it was trained on - confirm before trusting these numbers.
transformed_comment = cv.transform(comment)
pred = get_top_k_predictions(category_loaded_model, transformed_comment, 2) # Author's note: this line works, showing that the non-pickled transformer works with a pickled model
print(pred)

with open(extraction_path, 'rb') as transformer_file:
    category_loaded_transformer = pickle.load(transformer_file)
category_test_features = category_loaded_transformer.transform(comment)
prediction = get_top_k_predictions(category_loaded_model, category_test_features, 2)
# Author's note: the pickled transformer was reported not to work here. The
# result is printed so that it can be compared directly with `pred` above.
print(prediction)
