In [4]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
# Run the Colab user-authentication step first, then hand the resulting
# application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the module-level client used by the code below for listing,
# downloading and uploading files.
drive = GoogleDrive(gauth)

# PyDrive reference:
# https://googledrive.github.io/PyDrive/docs/build/html/index.html



import pandas as pd
import io



# List every non-trashed file whose parent is the given Drive folder id and
# print its title and id (these ids are the ones used for the downloads below).
file_list = drive.ListFile({'q': "'1PlLnE5-p0BclOjRS11AHQmC2yTw1pj1W' in parents and trashed=false"}).GetList()
for file1 in file_list:
  print('title: %s, id: %s' % (file1['title'], file1['id']))


# Download each input file by its Drive id into the current working
# directory; preprocess() reads training_data.csv, training_labels.csv and
# test_data.csv from there.
training_data_downloaded = drive.CreateFile({'id': '13AMkTE6bv8fC2Iw-TWFoGT_zshxVIieg'})
training_data_downloaded.GetContentFile('training_data.csv')

# training_desc.csv is downloaded here but not read by the functions below.
training_desc_downloaded = drive.CreateFile({'id': '1SLgTSdBJKXUw49wRcWPH_Kka0k0XGd-v'})
training_desc_downloaded.GetContentFile('training_desc.csv')

training_labels_downloaded = drive.CreateFile({'id': '1FRdQsuXmVgBV6w99YtyKBWsfXLjARLkG'})
training_labels_downloaded.GetContentFile('training_labels.csv')

test_data_downloaded = drive.CreateFile({'id': '1_kJMHwi-a5_BnfGSOn0G8eOXvMvQG3GD'})
test_data_downloaded.GetContentFile('test_data.csv')


import numpy as np

def preprocess():
    """Train on the local CSV files and upload predictions for the test set.

    Reads ``training_data.csv``, ``training_labels.csv`` and ``test_data.csv``
    from the working directory. All files are read without a header row;
    column 0 of the data files holds the sample name and the remaining
    columns are parsed as ``float32`` features. Labels are matched to the
    training rows by joining on column 0 of both files.

    The model is trained with ``calc_class_prob``, the test rows are
    classified with ``predict`` and the result is uploaded to Google Drive
    as ``predicted_labels.csv`` through the module-level ``drive`` client.
    Each output line is ``<sample name>,<predicted label>`` ending in CRLF.

    Returns:
        None.
    """
    # A single row is enough to learn the column labels of the data file.
    cols = pd.read_csv('training_data.csv', header=None, nrows=1).columns
    feature_cols = cols[1:]

    df_labels = pd.read_csv('training_labels.csv', header=None)

    # Column 0: sample names, kept separate from the numeric features.
    wholeindex = pd.read_csv('training_data.csv', header=None, usecols=[0])
    X_begin = pd.read_csv('training_data.csv', header=None,
                          usecols=feature_cols, dtype=np.float32).values
    # Left-join on the sample name so the labels follow the row order of
    # the training data rather than the order of the labels file.
    y = wholeindex.set_index(0).join(df_labels.set_index(0)).values
    classlist = np.unique(y)

    # The test file is read with the same feature columns as the training
    # file, i.e. it is expected to have the same column layout.
    testname = pd.read_csv('test_data.csv', header=None, usecols=[0]).values
    df_test = pd.read_csv('test_data.csv', header=None,
                          usecols=feature_cols, dtype=np.float32).values

    # To evaluate the classifier with 10-fold cross-validation instead,
    # call tenFloder(X_begin, y, classlist) here.

    classpro, promatrix = calc_class_prob(X_begin, y, classlist)
    result = predict(df_test, classpro, promatrix, classlist)

    # Build the whole CSV body in one join instead of repeated `+=`.
    resultstr = "".join(
        name_row[0] + ',' + label + '\r\n'
        for name_row, label in zip(testname, result)
    )

    # Set the content before the first upload so the Drive file is created
    # in a single request and never exists in an empty state.
    resultfile = drive.CreateFile({'title': 'predicted_labels.csv'})
    resultfile.SetContentString(resultstr)
    resultfile.Upload()
    
    

  
def calc_class_prob(X,y,classlist):
    """Fit a multinomial naive Bayes model with Laplace (add-one) smoothing.

    Args:
        X: 2-D array of feature counts/weights, one row per document.
        y: labels for the rows of ``X``; the row positions where
            ``y == label`` select the documents of that class.
        classlist: sequence of the distinct class labels.

    Returns:
        A tuple ``(classlist_prob, classword_prob)``:
        ``classlist_prob`` has shape ``(n_classes, 1)`` and holds the log
        prior of each class (share of documents in that class);
        ``classword_prob`` has shape ``(n_classes, n_features)`` and holds
        the smoothed log probability of each feature within each class.
        Rows follow the order of ``classlist``.
    """
    vocablary = X.shape[1]
    classnumber = len(classlist)
    classword_prob = np.zeros((classnumber, vocablary))
    classlist_prob = np.zeros((classnumber, 1))

    # The total document count is the same for every class.
    log_total_docs = np.log(len(X))

    for i, label in enumerate(classlist):
        # Row positions of the documents that belong to this class.
        pos_of_class = np.where(y == label)[0]
        tempclass = X[pos_of_class]

        # Log prior; working in log space avoids underflow later on.
        classlist_prob[i] = np.log(len(tempclass)) - log_total_docs

        # Per-feature totals for the class, stored in the float64 row.
        classword_prob[i] = tempclass.sum(axis=0)
        # Laplace denominator: total mass of the class plus one pseudo-count
        # per feature. Uses the array's own sum rather than the Python
        # builtin, which would iterate element by element.
        k = classword_prob[i].sum() + vocablary
        classword_prob[i] = np.log(classword_prob[i] + 1) - np.log(k)
    return classlist_prob, classword_prob


def predict(X,classpro,promatrix,classlist):
    """Classify each row of ``X`` with a trained multinomial naive Bayes model.

    Args:
        X: 2-D feature array, one row per sample.
        classpro: log class priors, shape ``(n_classes, 1)``.
        promatrix: log feature probabilities, shape
            ``(n_classes, n_features)``.
        classlist: NumPy array of class labels, in the same order as the
            rows of ``classpro`` and ``promatrix``.

    Returns:
        A list with one predicted label per row of ``X``. When several
        classes share the highest score, the one that comes first in
        ``classlist`` is chosen.
    """
    # Log-likelihood of every sample under every class plus the log prior;
    # the result has one row per sample and one column per class.
    scorelist = np.dot(X, promatrix.transpose()) + classpro.transpose()
    # argmax returns the first maximal column of each row, which matches
    # picking the first index where the row equals its maximum.
    best_class_index = np.argmax(scorelist, axis=1)
    return list(classlist[best_class_index])
        

def tenFloder(X,y,classlist):
    """Evaluate the classifier with 10-fold cross-validation and upload a report.

    The row indices of ``X`` are shuffled and split into ten folds. Each
    fold is used once as the test set while the remaining rows train the
    model (``calc_class_prob`` + ``predict``); per-fold metrics come from
    ``calcaulatePerformance``.

    The report, uploaded to Google Drive as ``performance.txt`` through the
    module-level ``drive`` client, contains the mean accuracy followed by
    the mean recall, precision and F1 (as percentages) for every class.

    Args:
        X: 2-D feature array, one row per sample.
        y: labels aligned with the rows of ``X``.
        classlist: sequence of the distinct class labels.

    Returns:
        None.
    """
    n_folds = 10

    # Shuffle the row indices, then cut them into the folds.
    arraylist = np.arange(X.shape[0])
    np.random.shuffle(arraylist)
    randomlist = np.array_split(arraylist, n_folds)

    accuracysum = 0
    RPFtotallist = []
    for item in randomlist:
        # Everything that is not in the current fold is training data.
        traningindex = np.setdiff1d(arraylist, item)
        classpro, promatrix = calc_class_prob(X[traningindex], y[traningindex], classlist)
        result = predict(X[item], classpro, promatrix, classlist)
        accuracy, RPFlist = calcaulatePerformance(result, y[item], classlist)
        accuracysum += accuracy
        RPFtotallist.append(RPFlist)

    accuracyavg = accuracysum / n_folds
    # Average over the folds: one (R, P, F1) row per class, in percent.
    RPFmean = np.mean(np.array(RPFtotallist), axis=0) * 100
    print(RPFmean)

    # Collect the report lines and join once instead of repeated `+=`.
    lines = ["Accuracy: " + str(round(accuracyavg * 100, 2)) + "%\r\n"]
    for label, (recall, precision, f1) in zip(classlist, RPFmean):
        lines.append("%s:\r\n R: %.2f%% P: %.2f%% F: %.2f%%\r\n" % (label, recall, precision, f1))

    # Set the content before the first upload so the Drive file is created
    # in a single request and never exists in an empty state.
    performancefile = drive.CreateFile({'title': 'performance.txt'})
    performancefile.SetContentString("".join(lines))
    performancefile.Upload()


def calcaulatePerformance(result,y_test,classlist):
    """Compute accuracy and per-class recall, precision and F1.

    Args:
        result: list of predicted labels, one per test sample.
        y_test: true labels for the same samples; may be a column vector
            (shape ``(n, 1)``) or any array-like that flattens to ``n``
            labels.
        classlist: iterable of the class labels to report on.

    Returns:
        A tuple ``(accuracy, RPFlist)``. ``accuracy`` is the fraction of
        samples predicted correctly (``0.0`` for an empty test set).
        ``RPFlist`` holds one ``[recall, precision, F1]`` list per entry of
        ``classlist``, in the same order. A metric whose denominator is
        zero (class absent from the test set, class never predicted, or
        recall + precision == 0) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    from collections import Counter

    # Flatten to a plain list of labels; unlike np.hstack this also accepts
    # an empty test set.
    y_test = np.asarray(y_test).ravel().tolist()
    number = len(y_test)

    # Count the correct predictions, the true labels and the predicted
    # labels once each instead of rescanning the lists for every class.
    correct = Counter(
        predicted for predicted, actual in zip(result, y_test) if predicted == actual
    )
    actual_counts = Counter(y_test)
    predicted_counts = Counter(result)

    rigth = sum(correct.values())
    accuracy = rigth / number if number else 0.0

    RPFlist = []
    for label in classlist:
        hits = correct[label]
        actual_total = actual_counts[label]
        predicted_total = predicted_counts[label]
        R = hits / actual_total if actual_total else 0.0
        P = hits / predicted_total if predicted_total else 0.0
        F1 = R * P * 2 / (R + P) if (R + P) else 0.0
        RPFlist.append([R, P, F1])
    return accuracy, RPFlist
 

# Entry point: run the train / predict / upload pipeline when this code is
# executed directly rather than imported.
if __name__ == "__main__":
    preprocess()


title: training_desc.csv, id: 1SLgTSdBJKXUw49wRcWPH_Kka0k0XGd-v
title: training_data.csv, id: 13AMkTE6bv8fC2Iw-TWFoGT_zshxVIieg
title: training_labels.csv, id: 1FRdQsuXmVgBV6w99YtyKBWsfXLjARLkG
title: test_data.csv, id: 1_kJMHwi-a5_BnfGSOn0G8eOXvMvQG3GD
