# Preparing tweets for labeling

In [1]:
import pandas as pd

import os

In [None]:
# Load the labeled tweets CSV; this frame is the source handed to
# crear_datasets further down.
tweets = pd.read_csv("./data/tweets_labeled.csv", sep=",")

In [85]:

def crear_datasets(dataset, scenario):
    """Yield the 30 %, 60 % and 100 % datasets for one scenario.

    Every scenario directory under ``./data`` is created if it is missing.
    Then, for each of the three dataset sizes, the CSV already stored in the
    requested scenario's directory is loaded when it exists; otherwise a
    random sample of ``dataset`` holding the corresponding share of rows is
    drawn, written to that CSV and yielded.

    Samples are saved together with their row index and the index is
    restored on load, so a dataset read back from disk has the same columns
    and index as the one that was originally created.

    Args:
        dataset (DataFrame): Original dataset containing all the data,
            already arranged and ready to be used.
        scenario (int): 1-based number of the scenario whose datasets are to
            be generated or loaded (1 to 3).

    Yields:
        DataFrame: The 30 % dataset, then the 60 % one, then the 100 % one.
        Nothing is yielded when ``scenario`` is out of range.
    """
    scenario_dirs = ['./data/scenario1/', './data/scenario2/', './data/scenario3/']

    # makedirs also creates a missing ./data parent, and exist_ok removes the
    # check-then-create race of a separate os.path.exists() test.
    for directory in scenario_dirs:
        os.makedirs(directory, exist_ok=True)

    scenario_index = scenario - 1
    if not 0 <= scenario_index < len(scenario_dirs):
        # Invalid scenario: finish the generator without yielding anything.
        return

    # File stem of each dataset paired with the share of rows it holds.
    sizes = (('dataset_30', 0.3), ('dataset_60', 0.6), ('dataset_100', 1))

    for name, fraction in sizes:
        csv_path = f"{scenario_dirs[scenario_index]}{name}.csv"

        if os.path.exists(csv_path):
            # The first column is the row index written by to_csv below;
            # reading it as the index avoids a stray "Unnamed: 0" column.
            yield pd.read_csv(csv_path, sep=',', index_col=0)
        else:
            quantity = round(len(dataset) * fraction)
            sampled = dataset.sample(n=quantity)
            sampled.to_csv(csv_path, index=True)
            yield sampled
            

In [None]:
# Build (or reload from disk) the three datasets of scenario 1; the generator
# yields three frames for a valid scenario, so it unpacks directly.
data1, data2, data3 = crear_datasets(tweets, scenario=1)