In [None]:
import logging
import os
from os.path import isfile, join

import numpy as np
import pandas as pd
from IPython.display import display # for displaying pandas style - display(df.head())
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import classification_report , confusion_matrix, accuracy_score
from sklearn.utils import shuffle
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import make_pipeline

In [None]:
def definePath(name):
    '''Return the full-texts corpus directory configured for a known user.

    Parameters:
        name: user identifier; only 'vitor' and 'lucas' are recognised
            (exact, case-sensitive match).

    Returns:
        The hard-coded directory path (str) associated with that user.

    Raises:
        ValueError: if `name` is not one of the recognised users.
    '''
    if name == 'vitor':
        return r'C:\Users\vitor\Documents\TCC.v3\Fake.br-Corpus\full_texts'
    if name == 'lucas':
        return r'C:\Users\nakam\Documents\Python Scripts\Fake\Fake.br-Corpus\full_texts'

    # Unknown user: keep the original console hint, then fail explicitly
    # instead of reaching a `return` of a variable that was never assigned.
    print('Não reconhecido. Vitor, é você né?')
    raise ValueError(
        "unknown user name: {!r} (expected 'vitor' or 'lucas')".format(name)
    )

def txtToDataframe(path):
    '''Load every text under `path`/true and `path`/fake into one DataFrame.

    Parameters:
        path: directory that contains the two sub-directories `true` and
            `fake`. Only regular files directly inside them are read;
            nested directories are skipped.

    Returns:
        A pandas DataFrame with the columns `texts` (file content decoded
        as UTF-8) and `labels` ('true' or 'fake'). All 'true' rows come
        first, then all 'fake' rows; within each label the rows follow the
        sorted file names.

    Raises:
        OSError: if one of the two sub-directories cannot be listed or a
            file cannot be opened.
    '''
    texts = []
    labels = []

    for label in ('true', 'fake'):
        # os.path.join picks the platform separator, so the same corpus
        # layout works outside Windows as well.
        folder = join(path, label)
        # os.listdir gives no ordering guarantee; sorting keeps the row
        # order reproducible between runs and machines.
        for file_name in sorted(os.listdir(folder)):
            file_path = join(folder, file_name)
            if not isfile(file_path):
                continue
            with open(file_path, encoding='utf8') as f:
                texts.append(f.read())
            labels.append(label)

    df = pd.DataFrame(list(zip(texts, labels)), columns=['texts', 'labels'])

    return df

def definePathEvaluate(name):
    '''Return the `Software` directory configured for a known user.

    Parameters:
        name: user identifier; only 'vitor' and 'lucas' are recognised
            (exact, case-sensitive match).

    Returns:
        The hard-coded directory path (str) associated with that user.

    Raises:
        ValueError: if `name` is not one of the recognised users.
    '''
    if name == 'vitor':
        return r'C:\Users\vitor\Documents\TCC.v3\Software'
    if name == 'lucas':
        return r'C:\Users\nakam\Documents\Python Scripts\Fake\Software'

    # Unknown user: keep the original console hint, then fail explicitly
    # instead of reaching a `return` of a variable that was never assigned.
    print('Não reconhecido. Vitor, é você né?')
    raise ValueError(
        "unknown user name: {!r} (expected 'vitor' or 'lucas')".format(name)
    )

def csvToDataFrame(path, feature=None):
    '''Read `<path>/<feature>.csv` and return it without its `Id` column.

    Parameters:
        path: directory that holds the feature CSV files.
        feature: base name of the CSV file (without the `.csv` suffix).
            When omitted, the user is asked for it on the console, which
            is what callers passing only `path` get.

    Returns:
        A pandas DataFrame with the file content minus the `Id` column.

    Raises:
        KeyError: if the file has no `Id` column (raised by `drop`).
        OSError: if the file cannot be read.
    '''
    if feature is None:
        feature = input('Tipo de feature a ser trabalhada: ')
    # join() uses the platform separator instead of a hard-coded backslash.
    df = pd.read_csv(join(path, feature + '.csv'))
    return df.drop(labels='Id', axis=1)

def getDatasetValues(df):
    '''Split a feature table into shuffled (X, y, Id) pieces.

    The `Tag` column becomes the label list `y`; the remaining columns
    become the value matrix `X`; `Id` is taken from the frame's index.
    The three are passed together through `shuffle`, so they are
    permuted in unison.

    Parameters:
        df: pandas DataFrame that contains a `Tag` column.

    Returns:
        Tuple `(X, y, Id)` as returned by `shuffle`.
    '''
    targets = df.loc[:, 'Tag'].tolist()
    features = df.drop('Tag', axis=1)

    # One shuffle call for all three keeps rows, labels and ids aligned.
    X, y, Id = shuffle(features.values, targets, features.index.values)
    return (X, y, Id)

def predictAndEvaluate(classifier, X, y, dataset_name, lc = 5,  n_jobs = 2, feature_selection = -1, save_model = False):
    '''Run 5-fold cross-validated predictions on growing prefixes of the data.

    The data is cut into `lc` cumulative prefixes (the first 1/lc of the
    instances, the first 2/lc, ..., all of them, in the order given) and
    `cross_val_predict` is run once per prefix.

    Parameters:
        classifier: the estimator handed to `cross_val_predict`.
        X: feature matrix, sliced by row as `X[:n]`.
        y: labels, sliced as `y[:n]`; `len(y)` is used as the instance count.
        dataset_name: only used to build the file name of the saved model
            (its base name without directories or extension).
        lc: number of prefixes to evaluate.
        n_jobs: forwarded to `cross_val_predict`.
        feature_selection: when greater than 0, the classifier is wrapped
            in a pipeline that first keeps that many features via
            `SelectKBest(mutual_info_classif)`.
        save_model: when true, fit `classifier` on all of `X`, `y` and dump
            it to `<classifier class>_<dataset base name>.pkl` (lower-cased)
            in the current working directory.

    Returns:
        A list with one `cross_val_predict` result per prefix, smallest
        prefix first.
    '''
    logger = logging.getLogger(__name__)

    # lc evenly spaced fractions in (0, 1] scaled to instance counts; the
    # leading 0 produced by linspace is dropped. The builtin `int` replaces
    # the `np.int` alias, which no longer exists in current NumPy.
    sizes = (np.linspace(0, 1, lc + 1) * len(y)).astype(int)[1:]

    # The estimator does not depend on the prefix, so build it once.
    if feature_selection > 0:
        # `k` is passed by keyword: it is keyword-only in current scikit-learn.
        estimator = make_pipeline(SelectKBest(mutual_info_classif, k=feature_selection), classifier)
    else:
        estimator = classifier

    predicts = []
    for val in sizes:
        logger.info('cross evaluating with %s%% of corpus', (val / len(y)) * 100)
        predicts.append(cross_val_predict(estimator, X[:val], y[:val], cv=5, verbose=False, n_jobs=n_jobs))

    if save_model:
        model_name = (classifier.__class__.__name__ + '_' + (dataset_name.split('\\')[-1].split('/')[-1].split('.')[0]) + '.pkl').lower()
        # NOTE(review): the saved model is the bare classifier fitted on all
        # features, even when the evaluation above used feature selection —
        # confirm this is intended.
        classifier.fit(X, y)
        joblib.dump(classifier, model_name)

    return predicts

In [None]:
def main():
    '''Ask for the user and feature type, load the CSV and print the dataset.

    Prints the shuffled `(X, y, Id)` tuple produced by `getDatasetValues`.
    '''
    user_name = input('Quem é você? ')
    feature_table = csvToDataFrame(definePathEvaluate(user_name))
    dataset = getDatasetValues(feature_table)
    print(dataset)


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()