In [None]:
"""
  Part 2: Train and evaluate a model
      - rescale the variables
      - create train and test set
      
"""

In [18]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
# Load the previously created features csv, index it by the
# 'query-docid' key and discard the leftover unnamed column from the export.
PTH_DATA = 'C:/6200-IR/homework-6-mplatt27/features.csv'
features = (
    pd.read_csv(PTH_DATA)
    .set_index('query-docid', drop=True)
    .drop(columns=['Unnamed: 0'])
)
features.head()

Unnamed: 0_level_0,q_id,label,bm25,laplace,jm,tfidf,okapi
query-docid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
54-AP890306-0169,54,1,27.931819,-92.0094,-40.259173,11.417692,2.178449
54-AP890328-0062,54,1,15.629973,-93.468869,-44.081304,6.206999,1.356581
54-AP890622-0082,54,1,25.712165,-93.211816,-40.934508,10.204885,1.921698
54-AP890118-0061,54,1,26.097545,-91.504046,-41.498399,10.332642,2.038044
54-AP891220-0132,54,1,23.732574,-94.021455,-41.175104,9.069493,1.787554


In [3]:
# Rescale the five retrieval-score columns with a RobustScaler and write the
# scaled values back into `features` (q_id and label are left untouched).
# NOTE(review): the scaler is fit on every row here, i.e. before the per-query
# train/test split done later, so test-fold rows influence the scaling
# statistics - confirm this is acceptable for the evaluation.
rs = RobustScaler()
columns = ['bm25', 'laplace', 'jm', 'tfidf', 'okapi']
features[columns] = rs.fit_transform(features[columns])
features.head()

Unnamed: 0_level_0,q_id,label,bm25,laplace,jm,tfidf,okapi
query-docid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
54-AP890306-0169,54,1,2.429305,-1.239644,-0.978838,2.917749,1.783206
54-AP890328-0062,54,1,0.663166,-1.279856,-1.19219,0.856552,0.635582
54-AP890622-0082,54,1,2.110636,-1.272773,-1.016535,2.437998,1.424689
54-AP890118-0061,54,1,2.165964,-1.22572,-1.048012,2.488535,1.58715
54-AP891220-0132,54,1,1.826432,-1.295081,-1.029965,1.98887,1.237376


In [24]:
# functions for training the model with cross validation

def get_query_ids():
    """
    Return the 25 known query ids as a NumPy array.

    The order is fixed; each row of five below is the slice that
    get_train_test_q_ids uses as one test fold.
    """
    query_ids = [
        85, 59, 56, 71, 64,
        62, 93, 99, 58, 77,
        54, 87, 94, 100, 89,
        61, 95, 68, 57, 97,
        98, 60, 80, 63, 91,
    ]
    return np.array(query_ids)

def get_train_test_q_ids(q, i):
    """
    Split the query ids into a train part and a test part for fold `i`.

    The test part is the i-th consecutive slice of five ids from `q`; the
    train part is every id of `q` that is not in that slice. With 25 ids and
    i in 0..4 every query lands in the test part exactly once.

    Returns (train_ids, test_ids). Note that train_ids comes from
    np.setdiff1d, so it is sorted and de-duplicated, while test_ids keeps
    the order of `q`.
    """
    fold_start = i * 5
    held_out = q[fold_start:fold_start + 5]
    remaining = np.setdiff1d(q, held_out)
    return remaining, held_out

def get_train_test_features(train_ids, test_ids, data=None):
    """
    Select the feature rows that belong to the train and the test queries.

    Parameters
    ----------
    train_ids : iterable of query ids whose rows form the train split.
    test_ids : iterable of query ids whose rows form the test split.
    data : DataFrame with a 'q_id' column, optional.
        Frame to split. When omitted, the notebook-level `features` frame is
        used, which is the original behaviour; passing it explicitly avoids
        depending on hidden notebook state.

    Returns
    -------
    (train_rows, test_rows) : two DataFrames filtered from the input frame,
    keeping its index and column layout.
    """
    frame = features if data is None else data
    query_col = frame['q_id']
    tst = frame[query_col.isin(test_ids)]
    trn = frame[query_col.isin(train_ids)]
    return trn, tst


def create_list_from_df(tr_x, tr_y, tst_x, tst_y):
    """
    Convert the train/test feature and label frames into plain Python lists.

    Feature frames become a list of [bm25, laplace, jm, tfidf, okapi] rows;
    label frames become a flat list of their `label` values. The result is
    returned as (train_x, train_y, test_x, test_y).
    """

    def feature_rows(frame):
        # one five-element feature vector per row, in a fixed column order
        return [
            [rec.bm25, rec.laplace, rec.jm, rec.tfidf, rec.okapi]
            for rec in frame.itertuples()
        ]

    def label_values(frame):
        # flat list of the label column
        return [rec.label for rec in frame.itertuples()]

    train_x = feature_rows(tr_x)
    train_y = label_values(tr_y)
    test_x = feature_rows(tst_x)
    test_y = label_values(tst_y)
    return train_x, train_y, test_x, test_y
    
    
# Directory the results files were originally hard-coded to; kept as the default.
_RESULTS_DIR = 'C:/6200-IR/homework-6-mplatt27/'


def _write_trec_run(scores, rows, path):
    """
    Write one results line per row of `rows` to `path`.

    Each line has the form "<q_id> Q0 <docid> <rank> <score> Exp". The
    q_id and docid are taken from the row's index label, which is expected
    to look like "<q_id>-<docid>" (e.g. "54-AP890306-0169"); the label is
    split at the first "-". `scores[k]` is the score of the k-th row.

    The rank restarts at 1 whenever the q_id changes between consecutive
    rows, so rows are assumed to be grouped by query.
    NOTE(review): the rank reflects row order, not score order. trec_eval is
    understood to rank by the score column itself - confirm before relying
    on the rank column.
    """
    rank = 0
    previous_q_id = None
    # `with` guarantees the file is closed even if a row fails to format;
    # mode "w" truncates an existing file, so no explicit delete is needed.
    with open(path, "w") as out:
        for position, key in enumerate(rows.index):
            q_id, _, docid = str(key).partition("-")
            q_id = q_id.strip()
            docid = docid.strip()
            if q_id != previous_q_id:
                rank = 0
                previous_q_id = q_id
            rank += 1
            fields = [q_id, "Q0", docid, str(rank), str(scores[position]), "Exp"]
            out.write(" ".join(fields) + "\n")


def write_results(scores, i, tst, out_dir=_RESULTS_DIR):
    """
    Write a results file to test with trec eval.

    Parameters
    ----------
    scores : sequence of scores, positionally aligned with the rows of `tst`.
    i : fold number; the file is named 'test_scores_<i>.txt'.
    tst : DataFrame whose index holds the "<q_id>-<docid>" keys.
    out_dir : directory to write into (defaults to the original location).

    An empty `tst` produces an empty file instead of raising.
    """
    name = 'test_scores_' + str(i) + '.txt'
    _write_trec_run(scores, tst, os.path.join(out_dir, name))
    return


def write_results_train(scores, i, tr, out_dir=_RESULTS_DIR):
    """
    Write a results file to test with trec eval for the training data.

    Same format and parameters as write_results, but the file is named
    'train_scores_<i>.txt' and `tr` holds the training rows.
    """
    name = 'train_scores_' + str(i) + '.txt'
    _write_trec_run(scores, tr, os.path.join(out_dir, name))
    return


def train_cross_validation(m, random_state=42):
    """
    Train a model with 5-fold cross-validation over queries and write the
    train and test scores of every fold to results files.

    Parameters
    ----------
    m : str
        'lr' for logistic regression (liblinear, L1 penalty, C=0.01) or
        'sv' for an RBF-kernel SVM (C=1, gamma=0.4) with probability output.
    random_state : int or None, optional
        Seed passed to the estimator so repeated runs give the same scores.
        Pass None to get the previous, unseeded behaviour.

    Raises
    ------
    ValueError
        If `m` is not one of the supported model keys. Previously an unknown
        key silently trained nothing and still printed "Done!".
    """
    feature_cols = ['bm25', 'laplace', 'jm', 'tfidf', 'okapi']

    # One factory per supported model key; a fresh estimator is built per fold.
    model_factories = {
        'lr': lambda: LogisticRegression(max_iter=1000, solver='liblinear', C=0.01,
                                         penalty='l1', random_state=random_state),
        'sv': lambda: SVC(C=1, gamma=0.4, kernel='rbf', probability=True,
                          random_state=random_state),
    }
    if m not in model_factories:
        raise ValueError(
            "Unknown model %r; expected one of %s" % (m, sorted(model_factories))
        )

    q_ids = get_query_ids()

    for i in range(5):
        print("Training session: ", i)

        # retrieve which q_ids will be train and test this round
        train_q_ids, test_q_ids = get_train_test_q_ids(q_ids, i)
        print(test_q_ids)

        # get the features for each
        train_features, test_features = get_train_test_features(train_q_ids, test_q_ids)

        # Select features and label by name rather than by column position,
        # and use the same DataFrame representation for fit and predict so
        # the estimator sees consistent feature names.
        x_train = train_features[feature_cols]
        y_train = train_features['label']
        x_test = test_features[feature_cols]

        model = model_factories[m]()
        model.fit(x_train, y_train)

        # Column 1 of predict_proba is the probability of the second class
        # (label 1 when the labels are 0/1); it is used as the ranking score.
        write_results(model.predict_proba(x_test)[:, 1], i, test_features)

        # run model on training queries
        write_results_train(model.predict_proba(x_train)[:, 1], i, train_features)

    print("Done!")


In [25]:
# Run the 5-fold cross-validation with logistic regression ("lr").
# Each fold prints its test query ids and writes a test and a train scores file.
train_cross_validation("lr")

Training session:  0
[85 59 56 71 64]
Training session:  1
[62 93 99 58 77]
Training session:  2
[ 54  87  94 100  89]
Training session:  3
[61 95 68 57 97]
Training session:  4
[98 60 80 63 91]
Done!


In [26]:
# Run the 5-fold cross-validation with the SVM ("sv").
# The scores files use the same names as the logistic regression run, so
# the files written by that run are overwritten.
train_cross_validation("sv")

Training session:  0
[85 59 56 71 64]
Training session:  1
[62 93 99 58 77]
Training session:  2
[ 54  87  94 100  89]
Training session:  3
[61 95 68 57 97]
Training session:  4
[98 60 80 63 91]
Done!
