---
title: "Analysis of Hyper-parameters on Single Cell RNA-Seq Data"
format:
    html:
        toc: true
        highlight-style: ayu
---

In [67]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import silhouette_score, silhouette_samples
import scanpy as sc
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import plotly.figure_factory as ff


## Getting the Data

We will analyze three types of hyper-parameters: the perplexity of t-SNE, the number of clusters chosen by an unsupervised method (and how it affects the quality of the selected features), and the type of regularization.

To perform these three tasks we are going to work with real data. Specifically, we will use scRNA-Seq data from a brain sample (GSM6900730). The data are available at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM6900730.  

In [68]:
# download the data and load it for the posterior analysis
import urllib.parse
import requests
import urllib
from bs4 import BeautifulSoup
import os
from ftplib import FTP

url = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM6900730"

# Fetch the GEO sample page. The timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() stops us from trying to
# parse an HTTP error page as if it were the sample record.
response = requests.get(url, timeout=60)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# The download links are looked up inside the first <table> of the page.
table = soup.find('table')
if table is None:
    raise RuntimeError(f"No <table> element found at {url}; cannot locate download links")
links = table.find_all('a')

# Files are written into the current working directory.
download_dir = os.getcwd()

for link in links:
    # Get the URL of the link and keep only those that mention 'ftp'
    link_url = link.get('href')
    if not link_url or 'ftp' not in link_url:
        continue

    # Parse the FTP URL (percent-escapes such as %5F are decoded first)
    link_url = urllib.parse.unquote(link_url)
    parsed_url = urllib.parse.urlparse(link_url)
    hostname = parsed_url.hostname
    path = parsed_url.path

    # Extract the file name from the path
    file_name = os.path.basename(path)

    # Skip hrefs that cannot be downloaded as a single file: relative links
    # have no hostname, and directory links (trailing '/') have no file name.
    if not hostname or not file_name:
        continue

    local_file_path = os.path.join(download_dir, file_name)

    # The context manager closes the FTP connection even if login, cwd or the
    # transfer raises, so a failed download does not leak a socket.
    with FTP(hostname, timeout=60) as ftp:
        ftp.login()  # anonymous login
        ftp.cwd(os.path.dirname(path))

        with open(local_file_path, "wb") as local_file:
            ftp.retrbinary(f"RETR {file_name}", local_file.write)

    print(f"Downloaded {file_name}")
        


Downloaded GSM6900730_JLE16_B1_barcodes.tsv.gz
Downloaded GSM6900730_JLE16_B1_features.tsv.gz
Downloaded GSM6900730_JLE16_B1_matrix.mtx.gz


In [69]:
# Read the 10x matrix from the directory the download cell wrote to
# (`download_dir`) instead of a machine-specific absolute path, so the
# notebook runs top to bottom on any machine. The downloaded files are named
# GSM6900730_JLE16_B1_{barcodes,features,matrix}..., so the common prefix is
# passed explicitly for scanpy to locate them.
scdata = sc.read_10x_mtx(download_dir,
                         var_names = 'gene_symbols',
                         prefix = 'GSM6900730_JLE16_B1_',
                         cache=True)

# Convert the expression matrix to a dense array for the downstream analysis.
# NOTE(review): presumably scdata.X is a scipy sparse matrix (it exposes
# .toarray()); densifying may use a lot of memory on large samples — confirm.
X_sc = scdata.X.toarray()
