### Upsampling and Downsampling Data

In [7]:
def upsample_training_data(X_train, y_train):
    '''
    Balance the training set by oversampling the rows labelled 1.

    X_train is copied, y_train is attached to the copy as a 'label' column, and the
    label-1 rows are drawn with replacement (fixed random_state=10) until there are
    as many of them as there are label-0 rows.

    Parameters:

    X_train: training features (dataframe); it is copied, not modified

    y_train: training target, assigned to the 'label' column of the copy

    Returns a single dataframe (features plus the 'label' column) holding the
    resampled label-1 rows followed by every label-0 row.

    NOTE(review): `resample` is presumably sklearn.utils.resample - confirm the import.
    '''
    combined = X_train.copy()
    combined['label'] = y_train

    negatives = combined[combined['label'] == 0]
    positives = combined[combined['label'] == 1]

    # Draw with replacement so the label-1 group can grow to the label-0 group's size.
    positives_upsampled = resample(
        positives,
        replace=True,
        n_samples=len(negatives),
        random_state=10,
    )

    return pd.concat([positives_upsampled, negatives])

In [630]:
def downsample_training_data(X_train, y_train):
    '''
    Balance the training set by downsampling the rows labelled 0.

    X_train is copied, y_train is attached to the copy as a 'label' column, and a
    random subset of the label-0 rows (fixed random_state=10) is kept so that there
    are as many of them as there are label-1 rows.

    Parameters:

    X_train: training features (dataframe); it is copied, not modified

    y_train: training target, assigned to the 'label' column of the copy

    Returns a single dataframe (features plus the 'label' column) holding the
    sampled label-0 rows followed by every label-1 row.

    NOTE(review): `resample` is presumably sklearn.utils.resample - confirm the import.
    '''
    training_data = X_train.copy()
    training_data['label'] = y_train

    train_0 = training_data[training_data.label == 0]
    train_1 = training_data[training_data.label == 1]

    n_target = len(train_1)

    # Downsampling should pick distinct rows: sampling with replacement would
    # duplicate some label-0 rows while discarding others for no benefit.
    # Replacement is only used as a fallback when label 0 has fewer rows than
    # requested, where sampling without replacement is impossible.
    train_0_down = resample(
        train_0,
        replace=n_target > len(train_0),
        n_samples=n_target,
        random_state=10,
    )

    return pd.concat([train_0_down, train_1])

## Functions to Test Classifiers:

In [13]:
def NB_compare_vectorization_model(X_train_col, y_train, X_val_col, y_val, classifier):
    '''
    Variant of the vectorizer comparison for classifiers that need dense input
    (written for Naive Bayes): each vectorizer's sparse output is converted to a
    dense array with .toarray() before fitting and predicting.

    No class-imbalance correction happens here; pass up- or downsampled data in
    if that is wanted.

    The vectorizers come from 'vectorization_list', which is NOT a parameter: it
    must be defined outside the function as a list of (name, vectorizer) tuples.

    Parameters:

    X_train_col: cleaned text column in training set

    y_train: target variable in training set; must expose .values (it is
             flattened with .values.ravel() before fitting)

    X_val_col: cleaned text column in validation set

    y_val: target variable in validation set

    classifier: classifier instance; it is re-fitted once per vectorizer

    Prints the rounded train/validation scores for every vectorizer and returns
    a dict mapping each vectorizer name to its unrounded scores.
    '''
    train_layout = (
        ('Train Accuracy', metrics.accuracy_score),
        ('Train Precision', metrics.precision_score),
        ('Train Recall', metrics.recall_score),
        ('Train F1', metrics.f1_score),
    )
    val_layout = (
        ('Validation Accuracy', metrics.accuracy_score),
        ('Validation Precision', metrics.precision_score),
        ('Validation Recall', metrics.recall_score),
        ('Validation F1', metrics.f1_score),
    )

    results = {}

    for vec_name, vec in vectorization_list:
        train_features = vec.fit_transform(X_train_col)
        val_features = vec.transform(X_val_col)

        # Densify the sparse matrices before handing them to the classifier.
        train_features = train_features.toarray()
        val_features = val_features.toarray()

        classifier.fit(train_features, y_train.values.ravel())

        fitted = classifier.predict(train_features)
        held_out = classifier.predict(val_features)

        # Console report: scores rounded to two decimals.
        print('The performance of the {} is:'.format(vec_name))
        for label, scorer in train_layout:
            print(label + ': ' + str(round(scorer(y_train, fitted), 2)))
        print('\n')
        for label, scorer in val_layout:
            print(label + ': ' + str(round(scorer(y_val, held_out), 2)))
        print('\n')

        # Returned scores keep full precision.
        scores = {label: scorer(y_train, fitted) for label, scorer in train_layout}
        scores.update((label, scorer(y_val, held_out)) for label, scorer in val_layout)
        results[vec_name] = scores

    return results

In [7]:
def single_vector_model(X_train_col, y_train, X_val_col, y_val, vectorizer, classifier):
    '''
    Vectorize the text with one vectorizer, fit one classifier, print its
    train/validation scores and return the validation confusion table.

    No class-imbalance correction happens here; pass up- or downsampled
    X_train_col / y_train in if that is wanted.

    Parameters:

    X_train_col: cleaned text column used to fit the vectorizer and the classifier

    y_train: training target; must expose .values (it is flattened with
             .values.ravel() before fitting)

    X_val_col: cleaned text column of the validation set (transformed only)

    y_val: validation target, used for scoring and as the rows of the crosstab

    vectorizer: text vectorizer instance (fitted on X_train_col here)

    classifier: classifier instance (fitted here)

    Returns a pandas crosstab of actual vs. predicted validation labels,
    including margins.
    '''
    train_matrix = vectorizer.fit_transform(X_train_col)
    val_matrix = vectorizer.transform(X_val_col)

    classifier.fit(train_matrix, y_train.values.ravel())

    fitted = classifier.predict(train_matrix)
    held_out = classifier.predict(val_matrix)

    train_report = (
        ('Train Accuracy: ', metrics.accuracy_score),
        ('Train Precision: ', metrics.precision_score),
        ('Train Recall: ', metrics.recall_score),
        ('Train F1: ', metrics.f1_score),
    )
    val_report = (
        ('Validation Accuracy: ', metrics.accuracy_score),
        ('Validation Precision: ', metrics.precision_score),
        ('Validation Recall: ', metrics.recall_score),
        ('Validation F1: ', metrics.f1_score),
    )

    # Scores are printed rounded to two decimals, train block first.
    for prefix, scorer in train_report:
        print(prefix + str(round(scorer(y_train, fitted), 2)))
    print('\n')
    for prefix, scorer in val_report:
        print(prefix + str(round(scorer(y_val, held_out), 2)))

    return pd.crosstab(
        y_val,
        held_out,
        rownames=['Actual'],
        colnames=['Predicted'],
        margins=True,
    )

In [33]:
def SMOTE_vector_model(X_train_col, y_train, X_val_col, y_val, vectorizer, classifier):
    '''
    Vectorize the text, fit a SMOTE + classifier pipeline, print its
    train/validation scores and return the validation confusion table.

    Predictions (and therefore all scores) are made on the vectorized training
    and validation data exactly as passed in.

    Parameters:

    X_train_col: cleaned text column used to fit the vectorizer and the pipeline

    y_train: training target; must expose .values (it is flattened with
             .values.ravel() before fitting)

    X_val_col: cleaned text column of the validation set (transformed only)

    y_val: validation target, used for scoring and as the rows of the crosstab

    vectorizer: text vectorizer instance (fitted on X_train_col here)

    classifier: classifier instance, used as the final pipeline step

    Returns a pandas crosstab of actual vs. predicted validation labels,
    including margins.

    NOTE(review): `make_pipeline` must accept a sampler step, so it is
    presumably imblearn.pipeline.make_pipeline - confirm the import.
    '''
    train_matrix = vectorizer.fit_transform(X_train_col)
    val_matrix = vectorizer.transform(X_val_col)

    sampler = SMOTE(random_state=1, sampling_strategy='not majority')
    pipeline = make_pipeline(sampler, classifier)
    fitted_model = pipeline.fit(train_matrix, y_train.values.ravel())

    fitted = fitted_model.predict(train_matrix)
    held_out = fitted_model.predict(val_matrix)

    train_report = (
        ('Train Accuracy: ', metrics.accuracy_score),
        ('Train Precision: ', metrics.precision_score),
        ('Train Recall: ', metrics.recall_score),
        ('Train F1: ', metrics.f1_score),
    )
    val_report = (
        ('Validation Accuracy: ', metrics.accuracy_score),
        ('Validation Precision: ', metrics.precision_score),
        ('Validation Recall: ', metrics.recall_score),
        ('Validation F1: ', metrics.f1_score),
    )

    # Scores are printed rounded to two decimals, train block first.
    for prefix, scorer in train_report:
        print(prefix + str(round(scorer(y_train, fitted), 2)))
    print('\n')
    for prefix, scorer in val_report:
        print(prefix + str(round(scorer(y_val, held_out), 2)))

    return pd.crosstab(
        y_val,
        held_out,
        rownames=['Actual'],
        colnames=['Predicted'],
        margins=True,
    )

In [32]:
def compare_vectorization_model(X_train_col, y_train, X_val_col, y_val, classifier):
    '''
    Score one classifier under every text vectorizer in 'vectorization_list'.

    'vectorization_list' is NOT a parameter: it must be defined outside the
    function as a list of (name, vectorizer) tuples.

    Parameters:

    X_train_col: cleaned text column in training set

    y_train: target variable in training set; must expose .values (it is
             flattened with .values.ravel() before fitting)

    X_val_col: cleaned text column in validation set

    y_val: target variable in validation set

    classifier: classifier instance; it is re-fitted once per vectorizer

    Returns a dict mapping each vectorizer name to a dict of train and
    validation accuracy, precision, recall and F1, each rounded to two decimals.
    '''
    results = {}

    for vec_name, vec in vectorization_list:
        train_matrix = vec.fit_transform(X_train_col)
        val_matrix = vec.transform(X_val_col)

        classifier.fit(train_matrix, y_train.values.ravel())

        fitted = classifier.predict(train_matrix)
        held_out = classifier.predict(val_matrix)

        # (result key, scoring function, true labels, predicted labels)
        layout = (
            ('Train Accuracy', metrics.accuracy_score, y_train, fitted),
            ('Train Precision', metrics.precision_score, y_train, fitted),
            ('Train Recall', metrics.recall_score, y_train, fitted),
            ('Train F1', metrics.f1_score, y_train, fitted),
            ('Validation Accuracy', metrics.accuracy_score, y_val, held_out),
            ('Validation Precision', metrics.precision_score, y_val, held_out),
            ('Validation Recall', metrics.recall_score, y_val, held_out),
            ('Validation F1', metrics.f1_score, y_val, held_out),
        )

        results[vec_name] = {
            key: round(scorer(truth, guess), 2)
            for key, scorer, truth, guess in layout
        }

    return results

In [16]:
def SMOTE_compare_vectorization_model(X_train_col, y_train, X_val_col, y_val, classifier):
    '''
    Score one classifier, fitted behind a SMOTE step, under every text
    vectorizer in 'vectorization_list'.

    'vectorization_list' is NOT a parameter: it must be defined outside the
    function as a list of (name, vectorizer) tuples.

    Predictions (and therefore all scores) are made on the vectorized training
    and validation data exactly as passed in.

    Parameters:

    X_train_col: cleaned text column in training set

    y_train: target variable in training set; must expose .values (it is
             flattened with .values.ravel() before fitting)

    X_val_col: cleaned text column in validation set

    y_val: target variable in validation set

    classifier: classifier instance, used as the final pipeline step and
                re-fitted once per vectorizer

    Returns a dict mapping each vectorizer name to a dict of train and
    validation accuracy, precision, recall and F1, each rounded to two decimals.

    NOTE(review): `make_pipeline` must accept a sampler step, so it is
    presumably imblearn.pipeline.make_pipeline - confirm the import.
    '''
    results = {}

    for vec_name, vec in vectorization_list:
        train_matrix = vec.fit_transform(X_train_col)
        val_matrix = vec.transform(X_val_col)

        # A fresh sampler and pipeline are built for every vectorizer.
        sampler = SMOTE(random_state=1, sampling_strategy='not majority')
        pipeline = make_pipeline(sampler, classifier)
        fitted_model = pipeline.fit(train_matrix, y_train.values.ravel())

        fitted = fitted_model.predict(train_matrix)
        held_out = fitted_model.predict(val_matrix)

        scores = {}
        scores['Train Accuracy'] = round(metrics.accuracy_score(y_train, fitted), 2)
        scores['Train Precision'] = round(metrics.precision_score(y_train, fitted), 2)
        scores['Train Recall'] = round(metrics.recall_score(y_train, fitted), 2)
        scores['Train F1'] = round(metrics.f1_score(y_train, fitted), 2)
        scores['Validation Accuracy'] = round(metrics.accuracy_score(y_val, held_out), 2)
        scores['Validation Precision'] = round(metrics.precision_score(y_val, held_out), 2)
        scores['Validation Recall'] = round(metrics.recall_score(y_val, held_out), 2)
        scores['Validation F1'] = round(metrics.f1_score(y_val, held_out), 2)

        results[vec_name] = scores

    return results

In [12]:
def SMOTE_compare_vectorization_model2(X_train_col, y_train, X_val_col, y_val, classifier, smote = False):
    '''
    Compares the performance of a single classifier using different text vectorization methods,
    optionally rebalancing the training classes with SMOTE before the classifier is fitted.

    Vectorization methods should be specified outside the function in a 'vectorization list'
    (the global 'vectorization_list'), which is a list of tuples specifying each name and
    vectorization method to be used.

    Parameters:

    X_train_col: cleaned text column in training set
    y_train: target variable in training set; must expose .values (flattened with .values.ravel())
    X_val_col: cleaned text column in validation set
    y_val: target variable in validation set
    classifier: classifier instance; it is re-fitted once per vectorizer
    smote: when truthy, the classifier is fitted behind a SMOTE step for EVERY vectorizer;
           when falsy (default), the classifier is fitted on the vectorized data directly

    Returns a dict mapping each vectorizer name to a dict of train and validation
    accuracy, precision, recall and F1, each rounded to two decimals.

    NOTE(review): `make_pipeline` must accept a sampler step, so it is presumably
    imblearn.pipeline.make_pipeline - confirm the import.
    '''
    # Read the flag once and never rebind `smote`: rebinding it to the sampler
    # object inside the loop would make the check fail from the second
    # vectorizer on, so only the first one would actually be rebalanced.
    use_smote = bool(smote)

    metrics_dict = {}

    for name, vectorizer in vectorization_list:

        X_train_transformed = vectorizer.fit_transform(X_train_col)
        X_val_transformed = vectorizer.transform(X_val_col)

        if use_smote:
            sampler = SMOTE(random_state=1, sampling_strategy='not majority')
            pipe = make_pipeline(sampler, classifier)
            model = pipe.fit(X_train_transformed, y_train.values.ravel())
        else:
            classifier.fit(X_train_transformed, y_train.values.ravel())
            model = classifier

        train_predictions = model.predict(X_train_transformed)
        val_predictions = model.predict(X_val_transformed)

        metrics_dict[name] = {
            'Train Accuracy': round(metrics.accuracy_score(y_train, train_predictions), 2),
            'Train Precision': round(metrics.precision_score(y_train, train_predictions), 2),
            'Train Recall': round(metrics.recall_score(y_train, train_predictions), 2),
            'Train F1': round(metrics.f1_score(y_train, train_predictions), 2),

            'Validation Accuracy': round(metrics.accuracy_score(y_val, val_predictions), 2),
            'Validation Precision': round(metrics.precision_score(y_val, val_predictions), 2),
            'Validation Recall': round(metrics.recall_score(y_val, val_predictions), 2),
            'Validation F1': round(metrics.f1_score(y_val, val_predictions), 2),
        }

    return metrics_dict


In [None]:
def SMOTE_compare_vectorization_model3(X_train_col, y_train, X_val_col, y_val, classifier, smote = False):
    '''
    Compares the performance of a single classifier using different text vectorization methods,
    optionally rebalancing the training classes with SMOTE before the classifier is fitted.

    Vectorization methods should be specified outside the function in a 'vectorization list'
    (the global 'vectorization_list'), which is a list of tuples specifying each name and
    vectorization method to be used.

    Parameters:

    X_train_col: cleaned text column in training set
    y_train: target variable in training set; must expose .values (flattened with .values.ravel())
    X_val_col: cleaned text column in validation set
    y_val: target variable in validation set
    classifier: classifier instance; it is re-fitted once per vectorizer
    smote: when truthy, the classifier is fitted behind a SMOTE step for EVERY vectorizer;
           when falsy (default), the classifier is fitted on the vectorized data directly

    Returns a dict mapping each vectorizer name to a dict of train and validation
    accuracy, precision, recall and F1, each rounded to two decimals.

    NOTE(review): `make_pipeline` must accept a sampler step, so it is presumably
    imblearn.pipeline.make_pipeline - confirm the import.
    '''
    # The boolean flag is captured under its own name and the sampler gets a
    # separate local: reusing `smote` for the sampler object would turn the
    # flag check false after the first vectorizer and silently skip SMOTE
    # for all the remaining ones.
    use_smote = bool(smote)

    metrics_dict = {}

    for name, vectorizer in vectorization_list:

        X_train_transformed = vectorizer.fit_transform(X_train_col)
        X_val_transformed = vectorizer.transform(X_val_col)

        if use_smote:
            sampler = SMOTE(random_state=1, sampling_strategy='not majority')
            pipe = make_pipeline(sampler, classifier)
            model = pipe.fit(X_train_transformed, y_train.values.ravel())
        else:
            classifier.fit(X_train_transformed, y_train.values.ravel())
            model = classifier

        train_predictions = model.predict(X_train_transformed)
        val_predictions = model.predict(X_val_transformed)

        metrics_dict[name] = {
            'Train Accuracy': round(metrics.accuracy_score(y_train, train_predictions), 2),
            'Train Precision': round(metrics.precision_score(y_train, train_predictions), 2),
            'Train Recall': round(metrics.recall_score(y_train, train_predictions), 2),
            'Train F1': round(metrics.f1_score(y_train, train_predictions), 2),

            'Validation Accuracy': round(metrics.accuracy_score(y_val, val_predictions), 2),
            'Validation Precision': round(metrics.precision_score(y_val, val_predictions), 2),
            'Validation Recall': round(metrics.recall_score(y_val, val_predictions), 2),
            'Validation F1': round(metrics.f1_score(y_val, val_predictions), 2),
        }

    return metrics_dict

In [None]:
def smote_classify(X_train_col, y_train, X_val_col, y_val):
    '''
    Vectorize the text, fit a SMOTE + classifier pipeline and store the rounded
    train/validation scores in metrics_dict[name].

    NOTE(review): 'vectorizer', 'classifier', 'metrics_dict' and 'name' are
    neither parameters nor locals - they are looked up in the enclosing (module)
    scope when the function runs, so all four must be defined before calling.
    This looks like a loop body lifted into a function; confirm the intent.

    Parameters:

    X_train_col: cleaned text column in training set

    y_train: target variable in training set; must expose .values (it is
             flattened with .values.ravel() before fitting)

    X_val_col: cleaned text column in validation set

    y_val: target variable in validation set

    Returns nothing; the result is written into metrics_dict under the key 'name'.
    '''
    train_matrix = vectorizer.fit_transform(X_train_col)
    val_matrix = vectorizer.transform(X_val_col)

    sampler = SMOTE(random_state=1, sampling_strategy='not majority')
    pipeline = make_pipeline(sampler, classifier)
    fitted_model = pipeline.fit(train_matrix, y_train.values.ravel())

    fitted = fitted_model.predict(train_matrix)
    held_out = fitted_model.predict(val_matrix)

    # (result key, scoring function, true labels, predicted labels)
    layout = (
        ('Train Accuracy', metrics.accuracy_score, y_train, fitted),
        ('Train Precision', metrics.precision_score, y_train, fitted),
        ('Train Recall', metrics.recall_score, y_train, fitted),
        ('Train F1', metrics.f1_score, y_train, fitted),
        ('Validation Accuracy', metrics.accuracy_score, y_val, held_out),
        ('Validation Precision', metrics.precision_score, y_val, held_out),
        ('Validation Recall', metrics.recall_score, y_val, held_out),
        ('Validation F1', metrics.f1_score, y_val, held_out),
    )

    metrics_dict[name] = {
        key: round(scorer(truth, guess), 2)
        for key, scorer, truth, guess in layout
    }

# Word to Vec

In [1]:
def tsne_plot(model):
    '''
    Plots the word embeddings created by the word2vec model

    Every word in the model's vocabulary is projected to two dimensions with
    t-SNE (perplexity=40, PCA initialisation, 250 iterations, random_state=23)
    and drawn as a labelled point on a 16x16 inch figure.

    Parameters:

    model: trained word2vec model; its keyed vectors are read from model.wv

    Returns nothing; the figure is displayed with plt.show().
    '''
    keyed_vectors = model.wv

    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases
    # expose it as `vocab`. Iterating either one yields the words.
    if hasattr(keyed_vectors, 'key_to_index'):
        vocabulary = keyed_vectors.key_to_index
    else:
        vocabulary = keyed_vectors.vocab

    labels = list(vocabulary)
    # Index the keyed vectors rather than the model itself: `model[word]` is
    # deprecated in gensim 3.x and not supported in gensim 4.
    tokens = [keyed_vectors[word] for word in labels]

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=250, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    for label, point in zip(labels, new_values):
        x_coord, y_coord = point[0], point[1]
        plt.scatter(x_coord, y_coord)
        plt.annotate(label,
                     xy=(x_coord, y_coord),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

#tsne_plot(model_w2v)

### Extra

In [2]:
def avg_word_vectors(wordlist, size, model_name):
    '''
    Calculate the mean word embedding of one list of words (e.g. one sentence).

    Words that the model does not know are skipped. Call this once per sentence
    to build the mean embeddings for a whole dataset.

    Parameters:

    wordlist: iterable of the words in one sentence

    size: dimensionality of the word vectors (size of the w2v hidden layer)

    model_name: source of the word vectors; either an object exposing the
                vectors as a `.wv` attribute (such as a word2vec model) or
                anything that supports `word in obj` and `obj[word]`

    Returns a numpy array of shape (1, size): the mean of the vectors of the
    known words, or all zeros when none of the words is known.
    '''
    # Use the keyed vectors when a full model is passed; otherwise use the
    # object itself as the word -> vector lookup.
    vectors = getattr(model_name, 'wv', model_name)

    sumvec = np.zeros(shape=(1, size))
    wordcnt = 0

    for word in wordlist:
        if word in vectors:
            sumvec += vectors[word]
            wordcnt += 1

    # Avoid dividing by zero when no word was found: return the zero vector.
    if wordcnt == 0:
        return sumvec

    return sumvec / wordcnt


In [10]:
def pca_smote_w2v_model_testcopy(X_train_w2v, y_train, X_val_w2v, y_val, classifier):
    '''
    Fit a PCA (10 components) -> SMOTE -> classifier pipeline on mean w2v
    embeddings, print its train/validation scores and return the validation
    confusion table.

    Parameters:

    X_train_w2v: mean word embedding array for the training data

    y_train: training target, passed to the pipeline as given

    X_val_w2v: mean word embedding array for the validation data

    y_val: validation target, used for scoring and as the rows of the crosstab

    classifier: classifier instance, used as the final pipeline step

    Returns a pandas crosstab of actual vs. predicted validation labels,
    including margins.

    NOTE(review): `make_pipeline` must accept a sampler step, so it is
    presumably imblearn.pipeline.make_pipeline - confirm the import.
    '''
    sampler = SMOTE(random_state=1, sampling_strategy='not majority')
    reducer = decomposition.PCA(n_components=10)

    # Dimensionality reduction runs first, then resampling, then the classifier.
    pipeline = make_pipeline(reducer, sampler, classifier)
    fitted_model = pipeline.fit(X_train_w2v, y_train)

    fitted = fitted_model.predict(X_train_w2v)
    held_out = fitted_model.predict(X_val_w2v)

    train_report = (
        ('Train Accuracy: ', metrics.accuracy_score),
        ('Train Precision: ', metrics.precision_score),
        ('Train Recall: ', metrics.recall_score),
        ('Train F1: ', metrics.f1_score),
    )
    val_report = (
        ('Validation Accuracy: ', metrics.accuracy_score),
        ('Validation Precision: ', metrics.precision_score),
        ('Validation Recall: ', metrics.recall_score),
        ('Validation F1: ', metrics.f1_score),
    )

    # Scores are printed rounded to two decimals, train block first.
    for prefix, scorer in train_report:
        print(prefix + str(round(scorer(y_train, fitted), 2)))
    print('\n')
    for prefix, scorer in val_report:
        print(prefix + str(round(scorer(y_val, held_out), 2)))

    return pd.crosstab(
        y_val,
        held_out,
        rownames=['Actual'],
        colnames=['Predicted'],
        margins=True,
    )