In [1]:
import pandas as pd
import numpy as np
import os
from os.path import isfile, join
from IPython.display import display # for displaying pandas style - display(df.head())

import joblib
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import classification_report , confusion_matrix, accuracy_score
from sklearn.utils import shuffle

from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
#from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import make_pipeline

In [2]:
def definePath(name):
    '''Return the corpus ``full_texts`` directory registered for a known user.

    Parameters
    ----------
    name : str
        User name; matched case-insensitively against the known users
        ('vitor', 'lucas').

    Returns
    -------
    str
        The hard-coded path registered for that user.

    Raises
    ------
    ValueError
        If ``name`` is not one of the known users.
    '''
    known_paths = {
        'vitor': r'C:\Users\vitor\Documents\TCC.v3\Fake.br-Corpus\full_texts',
        'lucas': r'C:\Users\nakam\Documents\Python Scripts\Fake\Fake.br-Corpus\full_texts',
    }
    key = name.lower()
    if key not in known_paths:
        # Fail loudly here: the previous version printed a message and then
        # crashed on `return path` with an unrelated UnboundLocalError.
        raise ValueError('Não reconhecido. Vitor, é você né? (name={!r})'.format(name))
    return known_paths[key]

def txtToDataframe(path):
    '''Read every file under ``path/true`` and ``path/fake`` into one DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the two sub-directories ``true`` and ``fake``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``texts`` (the full content of each file, decoded as
        UTF-8) and ``labels`` (``'true'`` or ``'fake'``, taken from the
        sub-directory name). All ``true`` rows come before the ``fake`` rows.
    '''
    texts = []
    labels = []

    # 'true' is processed before 'fake' so the row order matches the label order.
    for label in ('true', 'fake'):
        folder = join(path, label)  # os.path.join keeps this portable across OSes
        # os.listdir returns entries in arbitrary order; sort for a stable result.
        for filename in sorted(os.listdir(folder)):
            file_path = join(folder, filename)
            if not isfile(file_path):
                continue  # skip sub-directories and other non-file entries
            with open(file_path, encoding='utf8') as f:
                texts.append(f.read())
            labels.append(label)

    return pd.DataFrame({'texts': texts, 'labels': labels}, columns=['texts', 'labels'])

def definePathEvaluate(name):
    '''Return the base directory registered for a known user profile.

    Parameters
    ----------
    name : str
        Profile name; matched case-insensitively against the known profiles
        ('vitor', 'lucas', 'vitor-completo').

    Returns
    -------
    str
        The hard-coded directory registered for that profile.

    Raises
    ------
    ValueError
        If ``name`` is not one of the known profiles.
    '''
    known_paths = {
        'vitor': r'C:\Users\vitor\Documents\TCC.v3\Software',
        'lucas': r'C:\Users\nakam\Documents\Python Scripts\Fake\Software',
        'vitor-completo': r'C:\Users\vitor\Documents',
    }
    key = name.lower()
    if key not in known_paths:
        # Fail loudly here: the previous version printed a message and then
        # crashed on `return path` with an unrelated UnboundLocalError.
        raise ValueError('Não reconhecido. Vitor, é você né? (name={!r})'.format(name))
    return known_paths[key]

def csvToDataFrame(path):
    '''Ask for a feature-set name and load ``<path>/<feature>.csv``.

    The feature name is read interactively with ``input``; surrounding
    whitespace is stripped before it is used as the file stem.

    Parameters
    ----------
    path : str
        Directory that contains the feature CSV files.

    Returns
    -------
    tuple
        ``(df, feature)`` where ``df`` is the loaded DataFrame with its ``Id``
        column removed and ``feature`` is the name that was typed.
    '''
    feature = input('Tipo de feature a ser trabalhada: ').strip()
    # os.path.join instead of a literal backslash keeps this portable across OSes.
    csv_path = join(path, feature + '.csv')
    df = pd.read_csv(csv_path)
    df = df.drop(labels='Id', axis=1)
    return df, feature

def getDatasetValues(df, random_state=None):
    '''Split a feature DataFrame into shuffled features, labels and row ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Tag`` column (the labels); every other column is
        treated as a feature.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``sklearn.utils.shuffle``. Pass an integer to get the
        same ordering on every run; ``None`` keeps the previous
        non-deterministic behaviour.

    Returns
    -------
    tuple
        ``(X, y, Id)``: the feature values, the list of ``Tag`` labels and the
        DataFrame index values, all shuffled with the same permutation.
    '''
    y = df.loc[:, 'Tag'].tolist()
    features = df.drop('Tag', axis=1)

    X = features.values
    Id = features.index.values

    # One shuffle call over all three keeps rows, labels and ids aligned.
    X, y, Id = shuffle(X, y, Id, random_state=random_state)
    return (X, y, Id)

def predictAndEvaluate(classifier, X, y, dataset_name,  n_jobs = 2, feature_selection = 100, save_model = False):
    '''Cross-validate a classifier and optionally persist the fitted model.

    Parameters
    ----------
    classifier : estimator
        The scikit-learn style estimator to evaluate.
    X : array-like
        Feature matrix.
    y : array-like
        Labels, one per row of ``X``.
    dataset_name : str
        Used (lower-cased) to build the file name of the saved model.
    n_jobs : int, default=2
        Forwarded to ``cross_val_predict``.
    feature_selection : int, default=100
        When greater than 0, a ``SelectKBest(mutual_info_classif, k=...)``
        step is placed in front of the classifier; otherwise the classifier
        is used on its own.
    save_model : bool, default=False
        When true, the evaluated estimator is fitted on all of ``X, y`` and
        dumped with joblib to ``<ClassifierClass>_<dataset_name>.pkl`` in the
        current working directory.

    Returns
    -------
    list
        A one-element list holding the array returned by ``cross_val_predict``
        (cv=5).
    '''
    # Build the estimator once so the model that is evaluated is exactly the
    # model that gets saved. Previously the bare classifier (without the
    # feature-selection step) was fitted and saved even when the reported
    # predictions came from the SelectKBest pipeline.
    if feature_selection > 0:
        model = make_pipeline(SelectKBest(mutual_info_classif, k=feature_selection), classifier)
    else:
        model = classifier

    predicts = [cross_val_predict(model, X, y, cv=5, n_jobs=n_jobs)]

    if save_model:
        model_name = (classifier.__class__.__name__ + '_' + (dataset_name + '.pkl').lower())
        model.fit(X, y)
        joblib.dump(model, model_name)

    return predicts

## IDs de classificadores:
0. SVM (LinearSVC)
1. Naive-Bayes (MultinomialNB)
2. Random Forests (RandomForestClassifier)
3. Regressão Logística (LogisticRegression)


*Obs.: em predictAndEvaluate não está sendo aplicada a validação cruzada por fatias (slicing); é usado somente o cross_val_predict. Será que isso produz o mesmo resultado?*

In [4]:
def main():
    '''Interactive entry point: load a feature CSV, cross-validate one classifier, print results.

    Prompts for the user profile, the feature-set name and the classifier ID,
    then prints the accuracy of the cross-validated predictions followed by
    the list of ``(true label, predicted label)`` pairs.

    Raises
    ------
    ValueError
        If the classifier ID is not an integer or is outside the range of
        available classifiers.
    '''
    base_path = definePathEvaluate(input('Quem é você? '))
    df, feature = csvToDataFrame(base_path)

    X, y, Id = getDatasetValues(df)

    # The position in this list is the classifier ID the user types.
    classifierList = [LinearSVC(), MultinomialNB(), RandomForestClassifier(), LogisticRegression()]
    classifierID = int(input('Insira o ID do classificador: '))
    # Reject out-of-range IDs explicitly: a negative ID would otherwise
    # silently select a classifier counting from the end of the list.
    if not 0 <= classifierID < len(classifierList):
        raise ValueError(
            'ID de classificador inválido: {} (esperado entre 0 e {})'.format(
                classifierID, len(classifierList) - 1))
    classifier = classifierList[classifierID]

    # predictAndEvaluate returns a one-element list; take the prediction array.
    predictions = predictAndEvaluate(classifier, X, y, dataset_name=str(feature))[0].tolist()

    print('Acurácia: {:.4f}'.format(accuracy_score(y, predictions)))
    print(list(zip(y, predictions)))

if __name__ == '__main__':
    main()

Quem é você? Vitor-completo
Tipo de feature a ser trabalhada: tf-idf-completo-spacy
Insira o ID do classificador: 2


MemoryError: Unable to allocate 1.22 GiB for an array with shape (5760, 28518) and data type float64