In [1]:
# EDA e Visualização de Dados
import pandas as pd
import plotly.express as px
import seaborn as sns

# ML / tSNE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.manifold import TSNE

### Carregar os dados

In [2]:
# Load the cell dataset from CSV into a DataFrame

df_celulas = pd.read_csv('./datasets/synthetic_cell_data.csv')

In [3]:
# Inspect the structure: column names, non-null counts and dtypes

df_celulas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 51 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   gene_1     1200 non-null   float64
 1   gene_2     1200 non-null   float64
 2   gene_3     1200 non-null   float64
 3   gene_4     1200 non-null   float64
 4   gene_5     1200 non-null   float64
 5   gene_6     1200 non-null   float64
 6   gene_7     1200 non-null   float64
 7   gene_8     1200 non-null   float64
 8   gene_9     1200 non-null   float64
 9   gene_10    1200 non-null   float64
 10  gene_11    1200 non-null   float64
 11  gene_12    1200 non-null   float64
 12  gene_13    1200 non-null   float64
 13  gene_14    1200 non-null   float64
 14  gene_15    1200 non-null   float64
 15  gene_16    1200 non-null   float64
 16  gene_17    1200 non-null   float64
 17  gene_18    1200 non-null   float64
 18  gene_19    1200 non-null   float64
 19  gene_20    1200 non-null   float64
 20  gene_21 

In [4]:
# Preview the first 10 rows

df_celulas.head(10)

Unnamed: 0,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,gene_10,...,gene_42,gene_43,gene_44,gene_45,gene_46,gene_47,gene_48,gene_49,gene_50,cell_type
0,-2.645739,1.084029,1.033785,-0.492039,2.031826,-1.577436,-3.064917,7.414324,5.415871,-0.189912,...,3.113416,2.623661,-3.358139,4.445828,-1.543928,0.403476,4.233371,2.028571,1.281061,type_4
1,2.501869,-2.607423,0.254079,1.983542,-12.577327,3.64329,0.66207,2.639538,-0.270477,-0.060948,...,-0.713175,-4.108856,12.761196,-0.492549,-0.567835,0.541446,0.951115,5.316995,-2.845845,type_0
2,0.867952,6.024128,-0.025372,-0.639283,-6.976072,-4.469016,-0.079012,-0.494158,-0.422986,-0.397785,...,-0.885718,5.124325,-7.550761,-1.299736,-0.676549,0.895681,2.608146,6.540721,-5.255593,type_1
3,-5.450764,-1.251932,1.47989,-0.006988,7.958121,-6.493759,5.254006,14.731454,1.216928,-0.161186,...,-2.450433,-0.817676,4.400203,3.912871,0.578546,0.101902,-0.678812,1.194707,-0.450927,type_4
4,1.646937,5.899848,1.712686,-0.267201,0.736141,-5.839495,4.971377,20.075666,2.032173,-1.333014,...,0.26437,-14.056835,-0.752921,-2.6536,-1.22204,0.774236,-0.751898,2.663342,11.293108,type_4
5,7.985732,2.028616,-2.108824,-0.507353,8.291373,-2.446036,4.601581,0.004283,-2.517259,1.08168,...,5.231475,-26.485388,12.911786,-0.047978,0.155614,0.1989,-6.363055,3.607347,5.009559,type_3
6,1.958809,2.266977,1.011023,-0.405092,-16.608731,-2.589556,-6.015156,18.677365,0.578499,0.991975,...,-2.334598,-13.740739,-19.491929,-4.538443,0.979981,-0.214577,3.32553,0.553118,-4.690824,type_2
7,2.874223,3.364924,-0.203,1.035186,-6.790603,4.026036,-2.774885,-1.960487,1.612896,0.544958,...,4.415724,11.347547,-1.840424,1.171177,-0.407957,0.892767,-0.894967,-3.244944,2.463727,type_2
8,-0.512986,0.035735,-0.593628,1.303347,4.479463,1.273131,-1.787505,9.574,4.306202,-0.531092,...,-9.036559,-6.6281,-7.619347,1.489295,-0.051405,0.670841,5.828608,5.832769,-4.367958,type_0
9,4.40653,4.170972,0.207299,1.343522,-30.853167,1.801622,3.651366,16.729529,3.913599,-1.534798,...,-1.297853,-27.112376,0.928715,-2.415031,-0.581861,-0.878507,0.295091,-0.311437,-4.465743,type_0


In [5]:
# Preview the last 10 rows

df_celulas.tail(10)

Unnamed: 0,gene_1,gene_2,gene_3,gene_4,gene_5,gene_6,gene_7,gene_8,gene_9,gene_10,...,gene_42,gene_43,gene_44,gene_45,gene_46,gene_47,gene_48,gene_49,gene_50,cell_type
1190,5.130883,0.476906,0.572684,0.164194,-16.303079,-0.804524,3.025086,-0.202351,0.07962,0.762356,...,2.995797,1.908208,12.695648,-3.788416,-1.53276,-0.631514,3.304645,2.970055,-3.221716,type_1
1191,2.419617,0.793071,-1.387455,0.297629,-6.548332,1.896034,4.083528,9.176024,-0.274468,0.304089,...,-0.435532,-16.538046,4.923776,-5.460043,-1.109134,0.554562,-5.616602,0.834929,0.344466,type_3
1192,2.809619,-1.722702,-0.212278,0.414738,0.450263,-2.49136,3.639747,2.140902,-4.139727,0.232016,...,-3.505924,-1.198611,4.454946,-2.079284,0.73621,0.189327,3.129614,-3.765446,-2.256419,type_1
1193,-0.279306,3.419588,0.36396,-0.874726,0.121532,-3.953477,-2.105815,11.134908,2.993356,0.265918,...,0.751961,6.065656,-11.78104,1.542127,0.7942,-0.512953,-2.07552,-4.623439,3.917674,type_2
1194,4.480948,-4.003676,0.43895,0.874355,-15.558963,-4.308776,2.512687,5.345428,-1.813045,-0.12314,...,-6.391325,-8.222492,-0.986005,-3.294662,-0.414454,-1.351131,3.114485,-2.370096,-1.063343,type_1
1195,0.312161,1.812019,-2.533336,-1.102344,0.533694,3.347571,1.031235,18.645721,0.134931,-0.563479,...,-2.622969,-17.786068,-2.588984,-1.487115,-0.728954,-0.157001,2.504064,6.077598,-2.864014,type_0
1196,-0.501553,1.832728,0.931607,1.24492,-4.852209,-2.07282,4.101321,13.203193,0.815943,-1.309194,...,-0.185366,-15.970622,4.854048,3.852804,0.432747,1.112865,3.58136,7.639363,1.129314,type_0
1197,-4.075088,2.920439,-0.531233,-2.396565,7.180632,-2.219239,6.28906,24.722109,3.490356,1.134496,...,-5.551087,-11.637883,0.058639,-1.359873,-0.045205,-2.153882,-2.941699,3.113466,1.521064,type_4
1198,2.486184,1.080221,-1.683984,-0.066772,1.228014,1.656599,1.918117,3.275596,-4.283232,-0.107426,...,-0.280397,-17.671052,5.620615,3.957635,0.082049,-0.463108,-2.063854,2.857698,0.33981,type_3
1199,2.043415,-1.441373,-0.324062,1.639342,0.996307,-4.79481,-1.529178,19.241956,4.163896,-0.15091,...,0.517168,-17.306089,-10.024047,-0.980584,0.694392,1.440378,5.602316,1.858224,-0.726391,type_0


### Treinar o algoritmo t-SNE

In [6]:
# Feature matrix for t-SNE: every column of the original DataFrame except the
# 'cell_type' label. drop() returns a new frame, so df_celulas is left untouched.

X = df_celulas.drop(columns=['cell_type'])

In [7]:
# DataFrame that will hold the t-SNE results (starts out empty)

results_df = pd.DataFrame()

In [8]:
# Training loop: fit a 3-component t-SNE once per perplexity value (5 through 50
# inclusive) and gather every embedding into one long-format DataFrame.
#
# The per-perplexity frames are collected in a list and concatenated a single
# time after the loop. Calling pd.concat inside the loop re-copies all earlier
# rows on every iteration, and appending to a results_df created in another
# cell means that re-running this cell would duplicate every row.
#
# NOTE(review): X is passed to t-SNE unscaled even though StandardScaler is
# imported at the top of the notebook -- confirm this is intentional.

embedding_frames = []

for perplexity in range(5, 51):

    # Build and fit the model (fixed random_state keeps each run reproducible)
    tsne = TSNE(n_components=3, perplexity=perplexity, init="random", max_iter=1000, random_state=51)
    tsne_results = tsne.fit_transform(X)

    # Store the embedding together with the perplexity used and the cell label
    temp_df = pd.DataFrame(tsne_results, columns=['Componente 1', 'Componente 2', 'Componente 3'])

    temp_df['Perplexity'] = perplexity
    temp_df['cell_type'] = df_celulas['cell_type'].values

    embedding_frames.append(temp_df)

# Single concatenation; ignore_index gives the combined frame a clean 0..N-1 index
results_df = pd.concat(embedding_frames, axis=0, ignore_index=True)

In [9]:
# Give the combined results a fresh 0..N-1 index before plotting, then preview
# the first 10 rows

results_df = results_df.reset_index(drop=True)

results_df.head(10)

Unnamed: 0,Componente 1,Componente 2,Componente 3,Perplexity,cell_type
0,7.742982,7.312292,8.682014,5,type_4
1,10.016442,-20.538885,8.358107,5,type_0
2,6.754665,-2.884792,7.85715,5,type_1
3,2.303584,18.316565,8.51068,5,type_4
4,-4.15798,11.928552,16.326801,5,type_4
5,1.129007,-2.638783,-10.924747,5,type_3
6,-5.257857,-6.398366,15.195575,5,type_2
7,9.665257,8.030924,-8.567049,5,type_2
8,-6.175986,-7.496354,5.717935,5,type_0
9,-0.402604,-9.274341,17.593081,5,type_0


### Visualizar os Resultados

In [11]:
# 2D scatter of the first two t-SNE components, animated over Perplexity

# Plotly sizes the axes from the first animation frame only, so points in later
# frames can fall outside the view. Fix both axis ranges to the extent of all
# frames, plus a 5% margin, so every frame stays fully visible.
axis_limits = {}
for range_arg, column in (('range_x', 'Componente 1'), ('range_y', 'Componente 2')):
    low, high = float(results_df[column].min()), float(results_df[column].max())
    margin = 0.05 * (high - low)
    axis_limits[range_arg] = [low - margin, high + margin]

fig = px.scatter(
    results_df,
    x='Componente 1',
    y='Componente 2',
    animation_frame='Perplexity',
    color='cell_type',
    title='Visualização 2D do t-SNE com variação do Perplexity',
    **axis_limits,
)
fig.show()

In [12]:
# 3D scatter of the three t-SNE components, animated over Perplexity

# Plotly sizes the axes from the first animation frame only, so points in later
# frames can fall outside the view. Fix all three axis ranges to the extent of
# all frames, plus a 5% margin, so every frame stays fully visible.
axis_limits = {}
for range_arg, column in (
    ('range_x', 'Componente 1'),
    ('range_y', 'Componente 2'),
    ('range_z', 'Componente 3'),
):
    low, high = float(results_df[column].min()), float(results_df[column].max())
    margin = 0.05 * (high - low)
    axis_limits[range_arg] = [low - margin, high + margin]

fig = px.scatter_3d(
    results_df,
    x='Componente 1',
    y='Componente 2',
    z='Componente 3',
    animation_frame='Perplexity',
    color='cell_type',
    title='Visualização 3D do t-SNE com variação do Perplexity',
    **axis_limits,
)
fig.show()

### Análise Visual

#### Análise visual dos clusters

Após aplicar o t-SNE em 50 dimensões de expressão gênica e projetar em 2D e 3D, é possível observar:

- Há agrupamentos bem definidos para alguns tipos de células, indicando que seus perfis de expressão diferem significativamente.
- Certos tipos celulares apresentam sobreposição parcial, sugerindo maior similaridade entre eles.
- O t-SNE conseguiu capturar variações importantes do espaço original e distribuir os tipos de células de forma estruturada no plano 2D e no espaço