# Gene expression profiles to identify cancer types

## Importing Libraries

In [None]:
#data manipulation
import pandas as pd
import numpy as np

#data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Reading the data file

In [None]:
# Load the two input tables from the working directory.
# TODO: decide whether the CSVs should be read from GitHub instead.
def _read_indexed(csv_name):
    """Read a CSV file, using its first column as the row index."""
    return pd.read_csv(csv_name, index_col=0)


dt = _read_indexed('data.csv')    # feature table
lb = _read_indexed('labels.csv')  # label table

## Data exploration and cleaning

In [None]:
# Attach the labels to the data by joining the two tables on their row index.
# merge() performs an inner join by default, so only index values present in
# both tables are kept.
data_frame = dt.merge(lb, left_index=True, right_index=True)
print(data_frame.columns)
print(data_frame.shape)

# Give the label column ('Class') a more descriptive name: 'Cancer_Type'
data_frame = data_frame.rename({'Class': 'Cancer_Type'}, axis='columns')
print(data_frame.columns)

In [None]:
# Check for missing values: count the nulls in every column, then keep only
# the counts that are greater than zero. Note that `null` holds those counts
# (not the column names); its length is the number of columns that contain at
# least one missing value.
datanull = data_frame.isna().sum()
null = datanull[datanull > 0].tolist()

print('The columns with missing values are:%d' % len(null))

In [None]:
# Tally the samples per cancer type once; the same tallies feed both the
# printed table and the bar chart.
type_counts = data_frame['Cancer_Type'].value_counts()
print(type_counts)

# Bar chart with one bar per cancer type
ax = type_counts.plot.bar(color='purple')

# Axis labels and title, set on the axes returned by the plot call
ax.set_xlabel('Cancer Types')
ax.set_ylabel('# samples per type')
ax.set_title('Amount of samples per cancer type')

The differing number of samples per cancer type indicates class imbalance, so we will eventually have to select a model that accounts for this. In this exercise, however, we are going to use a regression.

In [None]:
# Split the table column-wise into features and target:
#   X -> every column except the last (gene expression levels)
#   y -> the last column (cancer type)
X, y = data_frame.iloc[:, :-1], data_frame.iloc[:, -1]


## Encoding the labels
Since the class (cancer type) is categorical, we need to convert (encode) it to numeric values to be able to perform the analysis.

In [None]:
# Convert the categorical target into integer codes with sklearn's LabelEncoder
# (fitting and transforming in a single step).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

labels = label_encoder.classes_   # class names learned by the encoder
classes = np.unique(y_encoded)    # distinct integer codes present in the target

print(labels)
print(classes)

In [None]:
# Fit a fresh encoder and derive the integer-coded target
# (this repeats the encoding performed in the previous cell).
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Regression model: ordinary least squares of the encoded label on all features.
# NOTE(review): X is rebound to the constant-augmented matrix here, so every
# cell executed after this one sees the extra intercept column.
# NOTE(review): the target is an integer code assigned to a nominal class, so
# the fit treats the codes as if they were ordered quantities - confirm this
# is intended for the exercise.
X = sm.add_constant(X)  # add the intercept term
model = sm.OLS(endog=y_encoded, exog=X).fit()

print(model.summary())

In [None]:
# Split the data into training and testing sets: 20% of the samples are held
# out for testing, and the seed is fixed so the split is reproducible.
# The split is stratified on the encoded labels so every cancer type keeps
# approximately the same proportion in both sets; with unevenly sized classes
# an unstratified split can under-represent the rarer types in the test set.
# Stratification requires at least two samples of every class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
