# **Pré-processamento**

## **Dados da coleta**

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Pre-processing of the collected survey data
coleta = (
    pd.read_csv('dados/Coleta115respostas.csv')
    # Drop the columns that do not hold sequences (the timestamp and,
    # presumably, the consent statement -- verify against the form export)
    .drop(columns=['Carimbo de data/hora', 'Declaro que '])
    # Stack every remaining column into a single column, one value per row
    .stack()
    .reset_index(drop=True)
    .to_frame('Sequências')
)

# Keep only sequences made of exactly 50 binary digits.
# fullmatch is used instead of a length check plus contains('^[01]+$') because
# `$` also matches right before a trailing newline, so a 49-digit answer ending
# in '\n' would pass both checks. na=False rejects non-string cells instead of
# producing NaN in the boolean mask.
coleta = coleta[coleta['Sequências'].str.fullmatch('[01]{50}', na=False)]

# Removendo duplicatas
def removeDupes(df):
    """Drop duplicated rows from ``df`` and report how many were removed.

    Args:
        df: DataFrame to deduplicate. It is not modified.

    Returns:
        A new DataFrame without duplicated rows and with a fresh 0..n-1 index.

    Side effects:
        Prints the original row count, the remaining row count and the loss
        (absolute and as a percentage of the original count).
    """
    original_count = df.shape[0]

    deduped = df.drop_duplicates().reset_index(drop=True)

    remaining_count = deduped.shape[0]
    loss = original_count - remaining_count
    # An empty input has nothing to lose; report 0% instead of dividing by zero
    loss_pct = loss / original_count * 100 if original_count else 0.0

    print(f'Quantidade original: {original_count}')
    print(f'Quantidade atual: {remaining_count} - Loss: {loss} ({loss_pct:.2f}%)')

    return deduped

# Deduplicate the collected sequences; removeDupes prints the row counts
coleta = coleta.pipe(removeDupes)

coleta

Quantidade original: 494
Quantidade atual: 488 - Loss: 6 (1.21%)


Unnamed: 0,Sequências
0,0010101001010101010101010000111101010001010101...
1,0010101001001001001001010101101001010010101010...
2,0001001001010010101111111111010101001001001010...
3,1001010101001010101011100000000010010100100101...
4,0101010101010100010100101011010111010010100010...
...,...
483,1111011010110001100001100000110001100001000010...
484,1000101010100011100001100001101010011010101001...
485,1010101000110011110101000110010000111010011100...
486,1010100000111101010100101010010101101010101001...


## **Dados do MindingTheData**

In [3]:
# Pre-processing of the MindingTheData dataset
mtd = pd.read_csv('dados/mindingthedata.csv')

# Drop the unneeded 'Unnamed: 0' column (presumably a saved index -- confirm)
mtd = mtd.drop(columns='Unnamed: 0')

# Encode each flip as a binary digit in a single pass: 'H' becomes '1' and any
# other character becomes '0'.
# NOTE(review): this assumes 'Flip' only contains 'H' and 'T'; any stray
# character (lowercase, whitespace) is silently encoded as '0'.
mtd['Sequências'] = mtd['Flip'].apply(
    lambda flips: ''.join('1' if flip == 'H' else '0' for flip in flips)
)
mtd = mtd.drop(columns=['Flip'])

mtd

Unnamed: 0,Sequências
0,1010111010100001010101101011010110101011110010...
1,1101010000101010111100010101010101100101000010...
2,1110001010101110001010101111000001010101000011...
3,1000001010111111001000101010111100010101110001...
4,1100101010110100001101011101110001111010111000...
...,...
80,1000101011110010111000101111100111100000101010...
81,0000111001011111001010000100001000101100000111...
82,1000101011111001000010111110010101000101011110...
83,1101000100001010011100110011111100010100101110...


## **Criando dataset de sequências**

In [4]:
# Merge both datasets into a single frame with a fresh 0..n-1 index
sequencias = pd.concat([coleta, mtd], ignore_index=True)

# Drop any repeated sequences; removeDupes prints the row counts
sequencias = sequencias.pipe(removeDupes)

sequencias

Quantidade original: 573
Quantidade atual: 573 - Loss: 0 (0.00%)


Unnamed: 0,Sequências
0,0010101001010101010101010000111101010001010101...
1,0010101001001001001001010101101001010010101010...
2,0001001001010010101111111111010101001001001010...
3,1001010101001010101011100000000010010100100101...
4,0101010101010100010100101011010111010010100010...
...,...
568,1000101011110010111000101111100111100000101010...
569,0000111001011111001010000100001000101100000111...
570,1000101011111001000010111110010101000101011110...
571,1101000100001010011100110011111100010100101110...


Adicionando as sequências "aleatórias" (geradas por um gerador de números pseudoaleatórios).

In [5]:
from random import choices, seed

# Fixed seed so that Restart & Run All regenerates exactly the same dataset
SEED = 42

# This cell overwrites `sequencias` with the labelled, merged dataset. Running
# it a second time would label the already-merged random rows as 'Humana' and
# double the data, so fail loudly instead of silently corrupting the labels.
if 'Classe' in sequencias.columns:
    raise RuntimeError(
        "'sequencias' is already labelled; re-run the notebook from the "
        "concatenation cell before executing this cell again."
    )

# Seed the standard-library generator used by choices()
seed(SEED)

# Build a dataset with the same structure as the human one, holding one
# pseudo-random 50-digit binary string per human sequence
aleatorias = pd.DataFrame(
    {'Sequências': [''.join(choices('01', k=50)) for _ in range(sequencias.shape[0])]}
)

# Add the class labels
sequencias['Classe'] = 'Humana'
aleatorias['Classe'] = 'Aleatória'

# Merge both datasets into a single frame with a fresh 0..n-1 index
sequencias = pd.concat([sequencias, aleatorias], ignore_index=True)

# Shuffle the rows reproducibly
sequencias = sequencias.sample(frac=1, random_state=SEED).reset_index(drop=True)

sequencias

Unnamed: 0,Sequências,Classe
0,0110010111000111001101101101011011011010111010...,Humana
1,1011100010110111011110101010101110010100110101...,Humana
2,0000001100010100011101011011100001010101011101...,Aleatória
3,1100011100001111111111101111001111110000000000...,Humana
4,1100001001001110011100000100010101001001100001...,Aleatória
...,...,...
1141,1101101001000101010101110101010000001001001111...,Aleatória
1142,0100000100000100001000010000100001000001000100...,Humana
1143,0001010111100001010111100010101111000000101000...,Humana
1144,1110001010101110001010101111000001010101000011...,Humana


In [6]:
# Write the final dataset to a CSV file, leaving the row index out
sequencias.to_csv(path_or_buf='sequencias-preprocessadas.csv', index=False)