# Introduction (Unit I) - Luisen

## 1.- Import data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt

In [2]:
# Read the HCV CSV file into a DataFrame (point the path at another file to analyse a different dataset).
data = pd.read_csv(filepath_or_buffer="HCV-Egy-Data.csv")

# Preview the first five rows (the default number returned by head()).
data.head(n=5)

Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,Fatigue & generalized bone ache,Jaundice,Epigastric pain,...,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF,Baseline histological Grading,Baselinehistological staging
0,56,1,35,2,1,1,1,2,2,2,...,5,5,5,655330,634536,288194,5,5,13,2
1,46,1,29,1,2,2,1,2,2,1,...,57,123,44,40620,538635,637056,336804,31085,4,2
2,57,1,33,2,2,2,2,1,1,1,...,5,5,5,571148,661346,5,735945,558829,4,4
3,49,2,33,1,2,1,2,1,2,1,...,48,77,33,1041941,449939,585688,744463,582301,10,3
4,59,1,32,1,1,2,1,2,2,2,...,94,90,30,660410,738756,3731527,338946,242861,11,1


## 2.- Data set description - "Hepatitis C Virus (HCV) for Egyptian patients" 

This dataset describes Egyptian patients who underwent treatment for HCV for about 18 months. Discretization should be applied based on expert recommendations; an attached file shows how.



Age Age
Gender Gender
BMI Body Mass Index
Fever Fever
Nausea/Vomting Nausea/Vomting
Headache Headache
Diarrhea Diarrhea
Fatigue & generalized bone ache Fatigue & generalized bone ache
Jaundice Jaundice
Epigastric pain Epigastric pain
WBC White blood cell
RBC red blood cells
HGB Hemoglobin
Plat Platelets
AST 1 aspartate transaminase ratio
ALT 1 alanine transaminase ratio 1 week
ALT 4 alanine transaminase ratio 4 weeks
ALT 12 alanine transaminase ratio 12 weeks
ALT 24 alanine transaminase ratio 24 weeks
ALT 36 alanine transaminase ratio 36 weeks
ALT 48 alanine transaminase ratio 48 weeks
ALT after 24 w alanine transaminase ratio after 24 weeks
RNA Base RNA Base
RNA 4 RNA 4
RNA 12 RNA 12
RNA EOT RNA end-of-treatment
RNA EF RNA Elongation Factor
Baseline histological Grading Baseline histological Grading
Baselinehistological staging Baselinehistological staging

#### a) Data type of each variable (numerical, categorical, etc.)

In [3]:
# Show the dtype pandas inferred for every column when reading the CSV.
data.dtypes

Age                                   int64
Gender                                int64
BMI                                   int64
Fever                                 int64
Nausea/Vomting                        int64
Headache                              int64
Diarrhea                              int64
Fatigue & generalized bone ache       int64
Jaundice                              int64
Epigastric pain                       int64
WBC                                   int64
RBC                                 float64
HGB                                   int64
Plat                                float64
AST 1                                 int64
ALT 1                                 int64
ALT4                                float64
ALT 12                                int64
ALT 24                                int64
ALT 36                                int64
ALT 48                                int64
ALT after 24 w                        int64
RNA Base                        

#### b) Column names

In [4]:
# List the raw column labels; the output shows that several carry trailing spaces (e.g. 'Age ').
data.columns

Index(['Age ', 'Gender', 'BMI', 'Fever', 'Nausea/Vomting', 'Headache ',
       'Diarrhea ', 'Fatigue & generalized bone ache ', 'Jaundice ',
       'Epigastric pain ', 'WBC', 'RBC', 'HGB', 'Plat', 'AST 1', 'ALT 1',
       'ALT4', 'ALT 12', 'ALT 24', 'ALT 36', 'ALT 48', 'ALT after 24 w',
       'RNA Base', 'RNA 4', 'RNA 12', 'RNA EOT', 'RNA EF',
       'Baseline histological Grading', 'Baselinehistological staging'],
      dtype='object')

#### c) Removing blank spaces from column names

In [5]:
# Drop every space character from each column label (leading, trailing and internal alike),
# e.g. 'Age ' -> 'Age' and 'ALT 1' -> 'ALT1', so columns can be used with attribute access.
data.columns = [label.replace(' ', '') for label in data.columns]
data.columns

Index(['Age', 'Gender', 'BMI', 'Fever', 'Nausea/Vomting', 'Headache',
       'Diarrhea', 'Fatigue&generalizedboneache', 'Jaundice', 'Epigastricpain',
       'WBC', 'RBC', 'HGB', 'Plat', 'AST1', 'ALT1', 'ALT4', 'ALT12', 'ALT24',
       'ALT36', 'ALT48', 'ALTafter24w', 'RNABase', 'RNA4', 'RNA12', 'RNAEOT',
       'RNAEF', 'BaselinehistologicalGrading', 'Baselinehistologicalstaging'],
      dtype='object')

#### d) Adding two categorical variables (from 'Baselinehistologicalstaging')
The new variable 'Outcome' indicates whether the patient has cirrhosis (1) or not (0), and 'Histologicalstaging' indicates the histological stage of the disease as a descriptive label.

In [6]:
# Map the numeric baseline staging to a descriptive label.
# With right=False every bin is closed on the left and open on the right:
#   [0, 1) -> 'NoFibrosis', [1, 2) -> 'PortalFibrosis', [2, 3) -> 'FewSepta',
#   [3, 4) -> 'ManySepta',  [4, 5) -> 'Cirrhosis'
category = pd.cut(
    data['Baselinehistologicalstaging'],
    bins=[0, 1, 2, 3, 4, 5],
    right=False,
    labels=['NoFibrosis', 'PortalFibrosis', 'FewSepta', 'ManySepta', 'Cirrhosis'],
)

# Plain assignment appends the column as the last one on the first run (the same place
# insert(29, ...) put it on the 29-column frame) and simply overwrites it when the cell
# is re-run; DataFrame.insert raises ValueError once the column already exists.
data['Histologicalstaging'] = category
data.columns

Index(['Age', 'Gender', 'BMI', 'Fever', 'Nausea/Vomting', 'Headache',
       'Diarrhea', 'Fatigue&generalizedboneache', 'Jaundice', 'Epigastricpain',
       'WBC', 'RBC', 'HGB', 'Plat', 'AST1', 'ALT1', 'ALT4', 'ALT12', 'ALT24',
       'ALT36', 'ALT48', 'ALTafter24w', 'RNABase', 'RNA4', 'RNA12', 'RNAEOT',
       'RNAEF', 'BaselinehistologicalGrading', 'Baselinehistologicalstaging',
       'Histologicalstaging'],
      dtype='object')

In [7]:
# Binary target: 0 when the baseline staging is below 4, 1 otherwise
# (stage 4 is the one labelled 'Cirrhosis').
new_var = np.where(data['Baselinehistologicalstaging'] < 4, 0, 1)

# Append 'Outcome' as the last column. Assignment is used instead of
# data.insert(30, ...) so that re-running the cell overwrites the column
# rather than raising ValueError because it already exists.
data['Outcome'] = new_var
data.columns

Index(['Age', 'Gender', 'BMI', 'Fever', 'Nausea/Vomting', 'Headache',
       'Diarrhea', 'Fatigue&generalizedboneache', 'Jaundice', 'Epigastricpain',
       'WBC', 'RBC', 'HGB', 'Plat', 'AST1', 'ALT1', 'ALT4', 'ALT12', 'ALT24',
       'ALT36', 'ALT48', 'ALTafter24w', 'RNABase', 'RNA4', 'RNA12', 'RNAEOT',
       'RNAEF', 'BaselinehistologicalGrading', 'Baselinehistologicalstaging',
       'Histologicalstaging', 'Outcome'],
      dtype='object')

In [8]:
# Re-check the column dtypes after adding the two derived columns.
data.dtypes

Age                               int64
Gender                            int64
BMI                               int64
Fever                             int64
Nausea/Vomting                    int64
Headache                          int64
Diarrhea                          int64
Fatigue&generalizedboneache       int64
Jaundice                          int64
Epigastricpain                    int64
WBC                               int64
RBC                             float64
HGB                               int64
Plat                            float64
AST1                              int64
ALT1                              int64
ALT4                            float64
ALT12                             int64
ALT24                             int64
ALT36                             int64
ALT48                             int64
ALTafter24w                       int64
RNABase                           int64
RNA4                              int64
RNA12                             int64


In [9]:
# Preview the first rows again, now including 'Histologicalstaging' and 'Outcome'.
data.head()

Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,Fatigue&generalizedboneache,Jaundice,Epigastricpain,...,ALTafter24w,RNABase,RNA4,RNA12,RNAEOT,RNAEF,BaselinehistologicalGrading,Baselinehistologicalstaging,Histologicalstaging,Outcome
0,56,1,35,2,1,1,1,2,2,2,...,5,655330,634536,288194,5,5,13,2,FewSepta,0
1,46,1,29,1,2,2,1,2,2,1,...,44,40620,538635,637056,336804,31085,4,2,FewSepta,0
2,57,1,33,2,2,2,2,1,1,1,...,5,571148,661346,5,735945,558829,4,4,Cirrhosis,1
3,49,2,33,1,2,1,2,1,2,1,...,33,1041941,449939,585688,744463,582301,10,3,ManySepta,0
4,59,1,32,1,1,2,1,2,2,2,...,30,660410,738756,3731527,338946,242861,11,1,PortalFibrosis,0


## 3.- Descriptive Statistical Analysis

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,Fatigue&generalizedboneache,Jaundice,Epigastricpain,...,ALT48,ALTafter24w,RNABase,RNA4,RNA12,RNAEOT,RNAEF,BaselinehistologicalGrading,Baselinehistologicalstaging,Outcome
count,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,...,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0
mean,46.319134,1.489531,28.608664,1.515523,1.502527,1.496029,1.502527,1.498917,1.501083,1.503971,...,83.629603,33.438267,590951.2,600895.6,288753.6,287660.336462,291378.290975,9.761733,2.536462,0.261372
std,8.781506,0.500071,4.076215,0.499939,0.500174,0.500165,0.500174,0.500179,0.500179,0.500165,...,26.223955,7.073569,353935.4,362315.1,285350.7,264559.52507,267700.691713,4.023896,1.121392,0.439541
min,32.0,1.0,22.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,5.0,5.0,11.0,5.0,5.0,5.0,5.0,3.0,1.0,0.0
25%,39.0,1.0,25.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,61.0,28.0,269253.0,270893.0,5.0,5.0,5.0,6.0,2.0,0.0
50%,46.0,1.0,29.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,...,83.0,34.0,593103.0,597869.0,234359.0,251376.0,244049.0,10.0,3.0,0.0
75%,54.0,2.0,32.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,106.0,40.0,886791.0,909093.0,524819.0,517806.0,527864.0,13.0,4.0,1.0
max,61.0,2.0,35.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,128.0,45.0,1201086.0,1201715.0,3731527.0,808450.0,810333.0,16.0,4.0,1.0


In [11]:
# include='all' adds the categorical 'Histologicalstaging' column to the summary
# (reported through the unique / top / freq rows).
data.describe(include='all')

Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,Fatigue&generalizedboneache,Jaundice,Epigastricpain,...,ALTafter24w,RNABase,RNA4,RNA12,RNAEOT,RNAEF,BaselinehistologicalGrading,Baselinehistologicalstaging,Histologicalstaging,Outcome
count,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,...,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385.0,1385,1385.0
unique,,,,,,,,,,,...,,,,,,,,,4,
top,,,,,,,,,,,...,,,,,,,,,Cirrhosis,
freq,,,,,,,,,,,...,,,,,,,,,362,
mean,46.319134,1.489531,28.608664,1.515523,1.502527,1.496029,1.502527,1.498917,1.501083,1.503971,...,33.438267,590951.2,600895.6,288753.6,287660.336462,291378.290975,9.761733,2.536462,,0.261372
std,8.781506,0.500071,4.076215,0.499939,0.500174,0.500165,0.500174,0.500179,0.500179,0.500165,...,7.073569,353935.4,362315.1,285350.7,264559.52507,267700.691713,4.023896,1.121392,,0.439541
min,32.0,1.0,22.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,5.0,11.0,5.0,5.0,5.0,5.0,3.0,1.0,,0.0
25%,39.0,1.0,25.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,28.0,269253.0,270893.0,5.0,5.0,5.0,6.0,2.0,,0.0
50%,46.0,1.0,29.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,...,34.0,593103.0,597869.0,234359.0,251376.0,244049.0,10.0,3.0,,0.0
75%,54.0,2.0,32.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,40.0,886791.0,909093.0,524819.0,517806.0,527864.0,13.0,4.0,,1.0


In [12]:
# Shape of the dataset as a (rows, columns) tuple.
print(data.shape)

(1385, 31)


In [13]:
# Number of distinct values in each column.
print(data.nunique())

Age                              30
Gender                            2
BMI                              14
Fever                             2
Nausea/Vomting                    2
Headache                          2
Diarrhea                          2
Fatigue&generalizedboneache       2
Jaundice                          2
Epigastricpain                    2
WBC                            1305
RBC                            1384
HGB                               6
Plat                           1375
AST1                             90
ALT1                             90
ALT4                             90
ALT12                            90
ALT24                            90
ALT36                            91
ALT48                            91
ALTafter24w                      25
RNABase                        1384
RNA4                           1384
RNA12                          1001
RNAEOT                         1002
RNAEF                          1004
BaselinehistologicalGrading 

In [14]:
# Class balance of the binary 'Outcome' target: number of rows per value.
data["Outcome"].value_counts()

0    1023
1     362
Name: Outcome, dtype: int64

In [9]:
# NOTE(review): looks like a leftover scratch cell — its execution count (In [9]) is out of
# sequence and the saved output does not match this statement; consider removing it.
print("Hi!")

Testing
