# <span style="color:green">Geolocalisation -  Notebook 1.1 Data preprocessing</span>
## <span style="color:green">0. Load libraries and data</span>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

## Since most of the data has already been trated and filtered, the code for uploading raw data is mostly commented
# df = pd.read_table('data/Geo_data.GT.h.txt', sep="\t", header=0) 

## <span style="color:green">1. Formatting the data</span>
### <span style="color:green">1.1 Remove last column as it has no information</span>

In [4]:
df = df.drop(df.columns[434], axis=1) # Remove last column as it has no information
dft = df.T # transpose data.frame

NameError: name 'df' is not defined

### <span style="color:green">1.2 Save data object with pickle / read data obtect with pickle</span>

In [4]:
#dft.to_pickle("./pkl/dft.pkl.gz", compression="infer") # Save object with pickle
#dft = pd.read_pickle("./pkl/dft.pkl.gz", compression="infer") # Import object with pickle

In [3]:
dft.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2690716,2690717,2690718,2690719,2690720,2690721,2690722,2690723,2690724,2690725
Chinko-16,9,9,9,9,9,9,9,9,9,9,...,0,0,0,0,0,0,2,0,0,0
Chinko-3,9,9,9,9,9,9,9,9,9,9,...,0,0,0,0,0,0,2,0,0,0
Chinko-5,9,9,9,9,9,9,9,9,9,9,...,0,0,0,0,0,0,2,0,0,0
CMNP1-8,9,9,9,9,9,9,9,9,9,9,...,0,0,0,0,0,0,2,0,0,0
Cnp1-1,9,9,9,9,9,9,9,9,9,9,...,0,0,0,0,0,0,2,0,0,0


### <span style="color:green">1.3 Replace missing (which is denoted with a 9) data by NaN</span>

In [4]:
dft_c = dft.replace(9, np.nan) # Replace 9 by NaN. Note that here NaN is usued as a synonyn for NA

(434, 2690726)

In [7]:
dft_c.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2690716,2690717,2690718,2690719,2690720,2690721,2690722,2690723,2690724,2690725
Baf1-12,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
Baf2-46,,,,,,,,,,,...,,,,,,,,,,
Cnp1-14,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,,,,,
Cnp1-2,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
Cnp1-36,,,,,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


### <span style="color:green">1.4 Filter out non-informative positions</span>
<span style="color:green"> Check whether all column values show the same values (either NaN or the same genotype)</span>

In [11]:
values = [] # Create an empty list in which to store the count of unique values per column
for column in range(len(dft_c.columns)): # Iterating over each column
    values.append(len(dft_c.iloc[:,column].dropna().unique())) # Ignoring NaNs, then count unique values

<span style="color:green"> The list **values** will be 0 when all samples have a NaN for the same column and 1 when they all have the same genotype. So filter out the columns with **values** below 2. </span>

In [16]:
dft_c2 = dft_c.iloc[:,[i for i, e in enumerate(values) if (e >= 2)]]
dft_c2.shape

In [19]:
#dft_c2.to_pickle("./pkl/dft_c2.pkl.gz", compression="infer") # Save object with pickle
#dft_c2 = pd.read_pickle("./pkl/dft_c2.pkl.gz", compression="infer") # Import object with pickle

### <span style="color:green">1.5 Remove samples and positions with many NaNs</span>
<span style="color:green"> Even though non-informative positions have been filtered out, there are still some positions and samples which remain non-informative: these are positions with a high number of NaN for most samples and also samples which have a high number of NaN for most positions. Since we are facing a problem of high degree of missing values, it is important to consider those samples and positions that do not add a lot of information for the modelling. Note that here *NaN* is usued as a synonyn for *NA* though they are not.</span>

<span style="color:green"> Leveraging the number of samples and positions that would result from this filtering, we ended up choosing to remove positions (columns in the dataframe) with more than 80% missingness (defined as those that did not have information for 80% or more of the samples, *rows*). We also filtered out the samples (rows in the dataframe) with more than 70% missingness (with NaN for 70% of their positions, *columns*).</span>

In [36]:
# Remove columns (positions), with more than 80% missingness (no information for 80% individuals)
df_c2_filtPos80 = dft_c2.dropna(axis = "columns", thresh = int(0.8*len(dft_c2))) 
print("Original dataframe:",  df_c2_filtPos80.shape)

# Remove rows (samples), with more than 70% missingness (no information in 70% positions)
df_c2_filtPos80filtSamp70 = df_c2_filtPos80.dropna(axis = "rows", thresh = int(0.7*len(df_c2_filtPos80.columns)))
print("Filtered dataframe:", df_c2_filtPos80filtSamp70.shape)

Original dataframe: (434, 288274)
Filtered dataframe: (346, 288274)


In [27]:
df_c2_filtPos80filtSamp70.to_pickle("./pkl/df_c2_filtPos80filtSamp70.pkl.gz", compression="infer") # Save object with pickle
#df_c2_filtPos80filtSamp70 = pd.read_pickle("./pkl/df_c2_filtPos80filtSamp70.pkl.gz", compression="infer") # Save object with pickle

<span style="color:green">Different filtering criteria were tested:</span>

In [25]:
# DO NOT NEED TO RUN
# Remove columns (positions), with more than 80% missingness (no information for 80% individuals)
df_c2_filtPos80 = dft_c2.dropna(axis = "columns", thresh = int(0.8*len(dft_c2))) 
print("df_c2_filtPos80", df_c2_filtPos80.shape)

# Remove rows (samples), with more than 50% missingness (no information in 50% positions)
df_c2_filtPos80filtSamp50 = df_c2_filtPos80.dropna(axis = "rows", thresh = int(0.5*len(df_c2_filtPos80.columns)))
print("df_c2_filtPos80filtSamp50", df_train_c_filtPos80filtSamp50.shape)

# Remove rows (samples), with more than 60% missingness (no information in 60% positions)
df_c2_filtPos80filtSamp60 = df_c2_filtPos80.dropna(axis = "rows", thresh = int(0.6*len(df_c2_filtPos80.columns)))
print("df_c2_filtPos80filtSamp60", df_train_c_filtPos80filtSamp60.shape)

# Remove rows (samples), with more than 70% missingness (no information in 70% positions)
df_c2_filtPos80filtSamp70 = df_c2_filtPos80.dropna(axis = "rows", thresh = int(0.7*len(df_c2_filtPos80.columns)))
print("df_c2_filtPos80filtSamp70", df_train_c_filtPos80filtSamp70.shape)

# Remove rows (samples), with more than 80% missingness (no information in 80% positions)
df_c2_filtPos80filtSamp80 = df_c2_filtPos80.dropna(axis = "rows", thresh = int(0.8*len(df_c2_filtPos80.columns)))
print("df_c2_filtPos80filtSamp80", df_train_c_filtPos80filtSamp80.shape)

df_c2_filtPos80 (434, 288274)
df_c2_filtPos80filtSamp50 (390, 288274)
df_c2_filtPos80filtSamp60 (367, 288274)
df_c2_filtPos80filtSamp70 (346, 288274)
df_c2_filtPos80filtSamp80 (306, 288274)
