### PCA - Analisis de componentes principales

link referencia: https://www.cienciadedatos.net/documentos/py19-pca-python.html#:~:text=El%20an%C3%A1lisis%20de%20componentes%20principales,vez%20que%20conserva%20su%20informaci%C3%B3n.

#### Carga de datos

In [2]:
# Importamos librerias a utilizar
import pandas as pd
import numpy as np
#Graficos
# ==============================================================================
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager
from matplotlib import style
style.use('ggplot') or plt.style.use('ggplot')
import seaborn as sns
#PCA
# ==============================================================================
import statsmodels.api as sm
# Preprocesado y modelado
# ==============================================================================
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Lectura del csv
df_air = pd.read_csv('Airlines.csv', sep=',')
df_air.head()

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,1,CO,269,SFO,IAH,3,15,205,1
1,2,US,1558,PHX,CLT,3,15,222,1
2,3,AA,2400,LAX,DFW,3,20,165,1
3,4,AA,2466,SFO,DFW,3,20,195,1
4,5,AS,108,ANC,SEA,3,30,202,0


In [4]:
# Lectura del csv
df_airport = pd.read_csv('Airports.csv', sep=';', on_bad_lines='skip')
df_airport.head()

Unnamed: 0,Airport,Cod_Airport,Desc_Airport,Loc
0,ATL - Hartsfield-Jackson Atlanta International...,ATL,Hartsfield-Jackson Atlanta International Airport,Georgia
1,AUS - Austin-Bergstrom International Airport -...,AUS,Austin-Bergstrom International Airport,Texas
2,BNA - Nashville International Airport - Tennessee,BNA,Nashville International Airport,Tennessee
3,BOS - Boston Logan International Airport - Mas...,BOS,Boston Logan International Airport,Massachusetts
4,BWI - Baltimore-Washington International Thurg...,BWI,Baltimore-Washington International Thurgood Ma...,Washington


In [5]:
df_airport.shape

(40, 4)

In [6]:
df_air.drop(df_air[df_air.Length == 0].index, inplace=True)
df_air.shape

(539379, 9)

#### Transformacion de los datos

In [7]:
df_air.loc[df_air['Time'] <= 500, 'TimeGroups'] = '<= 500'
df_air.loc[df_air['Time'].between(500, 801, inclusive='neither'), 'TimeGroups'] = '501 - 800'
df_air.loc[df_air['Time'].between(800, 1101, inclusive='neither'), 'TimeGroups'] = '801 - 1100'
df_air.loc[df_air['Time'] > 1100, 'TimeGroups'] = '> 1100'

df_air

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay,TimeGroups
0,1,CO,269,SFO,IAH,3,15,205,1,<= 500
1,2,US,1558,PHX,CLT,3,15,222,1,<= 500
2,3,AA,2400,LAX,DFW,3,20,165,1,<= 500
3,4,AA,2466,SFO,DFW,3,20,195,1,<= 500
4,5,AS,108,ANC,SEA,3,30,202,0,<= 500
...,...,...,...,...,...,...,...,...,...,...
539378,539379,CO,178,OGG,SNA,5,1439,326,0,> 1100
539379,539380,FL,398,SEA,ATL,5,1439,305,0,> 1100
539380,539381,FL,609,SFO,MKE,5,1439,255,0,> 1100
539381,539382,UA,78,HNL,SFO,5,1439,313,1,> 1100


In [8]:
df_air.loc[df_air['Length'] <= 80, 'LengthGroups'] = '<= 80'
df_air.loc[df_air['Length'].between(80, 141, inclusive='neither'),  'LengthGroups'] = '81 - 140'
df_air.loc[df_air['Length'].between(140, 201, inclusive='neither'),  'LengthGroups'] = '141 - 200'
df_air.loc[df_air['Length'] > 200, 'LengthGroups'] = '> 200'
df_air

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay,TimeGroups,LengthGroups
0,1,CO,269,SFO,IAH,3,15,205,1,<= 500,> 200
1,2,US,1558,PHX,CLT,3,15,222,1,<= 500,> 200
2,3,AA,2400,LAX,DFW,3,20,165,1,<= 500,141 - 200
3,4,AA,2466,SFO,DFW,3,20,195,1,<= 500,141 - 200
4,5,AS,108,ANC,SEA,3,30,202,0,<= 500,> 200
...,...,...,...,...,...,...,...,...,...,...,...
539378,539379,CO,178,OGG,SNA,5,1439,326,0,> 1100,> 200
539379,539380,FL,398,SEA,ATL,5,1439,305,0,> 1100,> 200
539380,539381,FL,609,SFO,MKE,5,1439,255,0,> 1100,> 200
539381,539382,UA,78,HNL,SFO,5,1439,313,1,> 1100,> 200


In [9]:
#Fusionamos Datasets
df = pd.merge(df_air, df_airport, how='left', left_on='AirportFrom', right_on='Cod_Airport')
df.head()

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay,TimeGroups,LengthGroups,Airport,Cod_Airport,Desc_Airport,Loc
0,1,CO,269,SFO,IAH,3,15,205,1,<= 500,> 200,SFO - San Francisco International Airport - Ca...,SFO,San Francisco International Airport,California
1,2,US,1558,PHX,CLT,3,15,222,1,<= 500,> 200,PHX - Phoenix Sky Harbor International Airport...,PHX,Phoenix Sky Harbor International Airport,Arizona
2,3,AA,2400,LAX,DFW,3,20,165,1,<= 500,141 - 200,LAX - Los Angeles International Airport - Cali...,LAX,Los Angeles International Airport,California
3,4,AA,2466,SFO,DFW,3,20,195,1,<= 500,141 - 200,SFO - San Francisco International Airport - Ca...,SFO,San Francisco International Airport,California
4,5,AS,108,ANC,SEA,3,30,202,0,<= 500,> 200,,,,


In [10]:
df.rename(columns={"Airport":"Airport_From", "Cod_Airport":"Cod_AirportFrom", "Desc_Airport":"Desc_AirportFrom", "Loc":"Loc_From"}, inplace=True)
df.head()

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay,TimeGroups,LengthGroups,Airport_From,Cod_AirportFrom,Desc_AirportFrom,Loc_From
0,1,CO,269,SFO,IAH,3,15,205,1,<= 500,> 200,SFO - San Francisco International Airport - Ca...,SFO,San Francisco International Airport,California
1,2,US,1558,PHX,CLT,3,15,222,1,<= 500,> 200,PHX - Phoenix Sky Harbor International Airport...,PHX,Phoenix Sky Harbor International Airport,Arizona
2,3,AA,2400,LAX,DFW,3,20,165,1,<= 500,141 - 200,LAX - Los Angeles International Airport - Cali...,LAX,Los Angeles International Airport,California
3,4,AA,2466,SFO,DFW,3,20,195,1,<= 500,141 - 200,SFO - San Francisco International Airport - Ca...,SFO,San Francisco International Airport,California
4,5,AS,108,ANC,SEA,3,30,202,0,<= 500,> 200,,,,


In [11]:
df = pd.merge(df, df_airport, how='left', left_on='AirportTo', right_on='Cod_Airport')
df.head()

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay,TimeGroups,LengthGroups,Airport_From,Cod_AirportFrom,Desc_AirportFrom,Loc_From,Airport,Cod_Airport,Desc_Airport,Loc
0,1,CO,269,SFO,IAH,3,15,205,1,<= 500,> 200,SFO - San Francisco International Airport - Ca...,SFO,San Francisco International Airport,California,IAH - George Bush Intercontinental Airport - H...,IAH,George Bush Intercontinental Airport,"Houston, Texas"
1,2,US,1558,PHX,CLT,3,15,222,1,<= 500,> 200,PHX - Phoenix Sky Harbor International Airport...,PHX,Phoenix Sky Harbor International Airport,Arizona,CLT - Charlotte Douglas International Airport ...,CLT,Charlotte Douglas International Airport,North Carolina
2,3,AA,2400,LAX,DFW,3,20,165,1,<= 500,141 - 200,LAX - Los Angeles International Airport - Cali...,LAX,Los Angeles International Airport,California,DFW - Dallas/Fort Worth International Airport ...,DFW,Dallas/Fort Worth International Airport,Texas
3,4,AA,2466,SFO,DFW,3,20,195,1,<= 500,141 - 200,SFO - San Francisco International Airport - Ca...,SFO,San Francisco International Airport,California,DFW - Dallas/Fort Worth International Airport ...,DFW,Dallas/Fort Worth International Airport,Texas
4,5,AS,108,ANC,SEA,3,30,202,0,<= 500,> 200,,,,,SEA - Seattle–Tacoma International Airport - W...,SEA,Seattle–Tacoma International Airport,Washington


In [12]:
df.rename(columns={"Airport":"Airport_To", "Cod_Airport":"Cod_AirportTo", "Desc_Airport":"Desc_AirportTo", "Loc":"Loc_To"}, inplace=True)
df.head()

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay,TimeGroups,LengthGroups,Airport_From,Cod_AirportFrom,Desc_AirportFrom,Loc_From,Airport_To,Cod_AirportTo,Desc_AirportTo,Loc_To
0,1,CO,269,SFO,IAH,3,15,205,1,<= 500,> 200,SFO - San Francisco International Airport - Ca...,SFO,San Francisco International Airport,California,IAH - George Bush Intercontinental Airport - H...,IAH,George Bush Intercontinental Airport,"Houston, Texas"
1,2,US,1558,PHX,CLT,3,15,222,1,<= 500,> 200,PHX - Phoenix Sky Harbor International Airport...,PHX,Phoenix Sky Harbor International Airport,Arizona,CLT - Charlotte Douglas International Airport ...,CLT,Charlotte Douglas International Airport,North Carolina
2,3,AA,2400,LAX,DFW,3,20,165,1,<= 500,141 - 200,LAX - Los Angeles International Airport - Cali...,LAX,Los Angeles International Airport,California,DFW - Dallas/Fort Worth International Airport ...,DFW,Dallas/Fort Worth International Airport,Texas
3,4,AA,2466,SFO,DFW,3,20,195,1,<= 500,141 - 200,SFO - San Francisco International Airport - Ca...,SFO,San Francisco International Airport,California,DFW - Dallas/Fort Worth International Airport ...,DFW,Dallas/Fort Worth International Airport,Texas
4,5,AS,108,ANC,SEA,3,30,202,0,<= 500,> 200,,,,,SEA - Seattle–Tacoma International Airport - W...,SEA,Seattle–Tacoma International Airport,Washington


In [13]:
df['Cod_AirportFrom'].fillna("Other", inplace=True)
df['Cod_AirportTo'].fillna("Other", inplace=True)
df['Loc_From'].fillna("Other", inplace=True)
df['Loc_To'].fillna("Other", inplace=True)

#### Exploracion inicial

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 539379 entries, 0 to 539378
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                539379 non-null  int64 
 1   Airline           539379 non-null  object
 2   Flight            539379 non-null  int64 
 3   AirportFrom       539379 non-null  object
 4   AirportTo         539379 non-null  object
 5   DayOfWeek         539379 non-null  int64 
 6   Time              539379 non-null  int64 
 7   Length            539379 non-null  int64 
 8   Delay             539379 non-null  int64 
 9   TimeGroups        539379 non-null  object
 10  LengthGroups      539379 non-null  object
 11  Airport_From      383654 non-null  object
 12  Cod_AirportFrom   539379 non-null  object
 13  Desc_AirportFrom  383654 non-null  object
 14  Loc_From          539379 non-null  object
 15  Airport_To        383670 non-null  object
 16  Cod_AirportTo     539379 non-null  obj

In [16]:
print('----------------------')
print('Media de cada variable')
print('----------------------')
df.mean(axis=0, numeric_only=True)

----------------------
Media de cada variable
----------------------


id           269692.152479
Flight         2427.945135
DayOfWeek         3.929649
Time            802.729500
Length          132.202987
Delay             0.445444
dtype: float64

In [18]:
print('-------------------------')
print('Varianza de cada variable')
print('-------------------------')
df.var(axis=0, numeric_only=True)

-------------------------
Varianza de cada variable
-------------------------


id           2.424463e+10
Flight       4.274261e+06
DayOfWeek    3.665916e+00
Time         7.730958e+04
Length       4.916303e+03
Delay        2.470241e-01
dtype: float64

#### Modelo PCA

In [19]:
# Entrenamiento modelo PCA con escalado de los datos
# ==============================================================================
pca_pipe = make_pipeline(StandardScaler(), PCA())
pca_pipe.fit(df)

# Se extrae el modelo entrenado del pipeline
modelo_pca = pca_pipe.named_steps['pca']

ValueError: could not convert string to float: 'CO'