In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
%matplotlib inline

In [2]:
data_dir = os.path.join('.', 'data')

In [3]:
def remove_parens(x):
    return x[1:-1]

In [4]:
def load_data(filename, index_col=None):
    return pd.read_csv(os.path.join(data_dir, filename), index_col=index_col, engine='c')

In [5]:
def load_drive():
    drive = load_data('drive.csv', index_col=['gsis_id', 'drive_id'])
    drive = drive.loc[drive.start_field.notna() & drive.start_field.notnull()]
    drive.start_field = drive.start_field.astype(str).apply(remove_parens).astype(int)
    drive.end_field = drive.end_field.astype(str).apply(remove_parens).astype(int)
    drive['ended_in_opposing_territory'] = drive.end_field > 0
    
    return drive

drive = load_drive()
drive.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,start_field,start_time,end_field,end_time,pos_team,pos_time,first_downs,result,penalty_yards,yards_gained,play_count,time_inserted,time_updated,ended_in_opposing_territory
gsis_id,drive_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2009081350,1,-30,"(Q1,0)",-11,"(Q1,239)",PIT,(239),1,Punt,4,15,6,2013-09-04 01:36:23.420121-04,2013-09-04 01:36:23.420121-04,False
2009081350,2,-35,"(Q1,239)",7,"(Q1,526)",ARI,(287),3,Punt,0,42,10,2013-09-04 01:36:23.420121-04,2013-09-04 01:36:23.420121-04,True
2009081350,3,-42,"(Q1,526)",-14,"(Q1,870)",PIT,(344),2,Punt,-5,33,8,2013-09-04 01:36:23.420121-04,2013-09-04 01:36:23.420121-04,False
2009081350,4,-29,"(Q1,870)",11,"(Q2,244)",ARI,(274),3,Punt,0,40,10,2013-09-04 01:36:23.420121-04,2013-09-04 01:36:23.420121-04,True
2009081350,5,-39,"(Q2,244)",18,"(Q2,437)",PIT,(193),1,Field Goal,-5,62,6,2013-09-04 01:36:23.420121-04,2013-09-04 01:36:23.420121-04,True


In [6]:
game = load_data('game.csv')
game['away_win'] = game.away_score > game.home_score
game['home_win'] = game.home_score > game.away_score
game['turnover_differential'] = game.home_turnovers - game.away_turnovers
game.head()

Unnamed: 0,gsis_id,gamekey,start_time,week,day_of_week,season_year,season_type,finished,home_team,home_score,...,away_score_q2,away_score_q3,away_score_q4,away_score_q5,away_turnovers,time_inserted,time_updated,away_win,home_win,turnover_differential
0,2014122106,56403,2014-12-21 13:00:00-05,16,Sunday,2014,Regular,t,PIT,20,...,3,0,6,0,0,2014-07-24 17:32:41.702371-04,2014-12-24 02:59:45.135816-05,False,True,0
1,2015102500,56595,2015-10-25 09:30:00-04,7,Sunday,2015,Regular,t,JAC,34,...,10,0,18,0,0,2015-10-25 19:28:34.570797-04,2015-10-28 02:59:49.506824-04,False,True,1
2,2014122107,56404,2014-12-21 13:00:00-05,16,Sunday,2014,Regular,t,TB,3,...,3,0,10,0,2,2014-07-24 17:32:41.702371-04,2014-12-24 02:59:45.135816-05,True,False,0
3,2014122108,56405,2014-12-21 16:05:00-05,16,Sunday,2014,Regular,t,STL,27,...,10,14,3,0,1,2014-07-24 17:32:41.702371-04,2014-12-24 02:59:45.135816-05,True,False,-1
4,2014122109,56406,2014-12-21 16:25:00-05,16,Sunday,2014,Regular,t,DAL,42,...,0,0,7,0,2,2014-07-24 17:32:41.702371-04,2014-12-24 02:59:45.135816-05,False,True,1


In [7]:
defense_point_columns = ['defense_int_tds', 'defense_misc_tds', 'defense_frec_tds', 'kickret_tds', 'defense_safe', 'kicking_rec_tds', 'puntret_tds']
offense_point_columns = ['rushing_tds', 'passing_tds', 'fumbles_rec_tds', 'kicking_fgm', 'kicking_xpmade', 'rushing_twoptm', 'receiving_twoptm', 'receiving_tds', 'passing_twoptm']
play_type_columns = ['rushing_att', 'passing_att', 'kicking_fga', 'kicking_xpa', 'punting_tot']

def _get_play_type(row):
    if row['passing_att'] == 1:
        return 'pass'
    elif row['rushing_att'] == 1:
        return 'rush'
    elif row['kicking_fga'] == 1:
        return 'field goal'
    elif row['punting_tot'] == 1:
        return 'punt'
    elif row['kicking_xpa'] == 1:
        return 'extra point'
    else:
        return 'unknown'

def load_agg_play():
    agg_play = load_data('agg_play.csv')
    
    for x in play_type_columns + defense_point_columns + offense_point_columns:
        agg_play[x] = agg_play[x].astype(int)
        
    agg_play = agg_play.loc[:, ['gsis_id', 'drive_id', 'play_id'] + play_type_columns + defense_point_columns + offense_point_columns]
    agg_play['play_type'] = agg_play.apply(_get_play_type, axis=1)
    
    return agg_play

agg_play = load_agg_play()
agg_play.head(25)

Unnamed: 0,gsis_id,drive_id,play_id,rushing_att,passing_att,kicking_fga,kicking_xpa,punting_tot,defense_int_tds,defense_misc_tds,...,rushing_tds,passing_tds,fumbles_rec_tds,kicking_fgm,kicking_xpmade,rushing_twoptm,receiving_twoptm,receiving_tds,passing_twoptm,play_type
0,2015092707,1,36,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unknown
1,2015092703,1,36,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unknown
2,2015092800,23,4074,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unknown
3,2015092708,1,36,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unknown
4,2015091000,4,836,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unknown
5,2015101805,2,237,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pass
6,2015101805,12,2070,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,rush
7,2015101806,1,132,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,rush
8,2015101806,1,251,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,extra point
9,2015101804,1,200,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pass


In [8]:
drive_columns = ['gsis_id', 'drive_id', 'home_team', 'away_team', 'pos_team']
game_drive = pd.merge(game.loc[:, ['gsis_id', 'home_team', 'away_team']], drive.reset_index(), on=['gsis_id'])
agg_play = pd.merge(game_drive.loc[:, drive_columns], agg_play, on=['gsis_id', 'drive_id'])
agg_play = agg_play.sort_values(['gsis_id', 'play_id'])

grouped = agg_play.groupby(['gsis_id'])
rows = []
for name, group in grouped:
    away_score = 0
    home_score = 0
    
    last_was_td = False
    
    for index, g in group.iterrows():
        if g.pos_team == g.home_team:
            home_team_pos = True
        else:
            home_team_pos = False
        
        if g['defense_int_tds'] == 1 or g['defense_misc_tds'] == 1 or g['defense_frec_tds'] == 1 or g['kickret_tds'] == 1 or g['kicking_rec_tds'] == 1 or g['puntret_tds']:
            last_was_td = True
            if home_team_pos:
                away_score += 6
            else:
                home_score += 6
        elif g['rushing_tds'] == 1 or g['passing_tds'] == 1 or g['fumbles_rec_tds'] == 1 or g['receiving_tds']:
            last_was_td = True
            if home_team_pos:
                home_score += 6
            else:
                away_score += 6
        elif g['kicking_fgm'] == 1:
            last_was_td = False
            if home_team_pos:
                home_score += 3
            else:
                away_score += 3
        elif g['kicking_xpmade'] == 1:
            if home_team_pos:
                home_score += 1
            else:
                away_score += 1
            if not last_was_td:
                if home_team_pos:
                    home_score += 6
                else:
                    away_score += 6
            last_was_td = False
        elif g['defense_safe'] == 1:
            last_was_td = False
            if home_team_pos:
                home_score += 1
            else:
                away_score += 1
        elif g['passing_twoptm'] or g['receiving_twoptm'] == 1 or g['rushing_twoptm'] == 1:
            last_was_td = False
            if home_team_pos:
                home_score += 2
            else:
                away_score += 2
        
        g['home_score'] = home_score
        g['away_score'] = away_score
        
        rows.append(g)
                
    
plays_with_scores = pd.DataFrame(rows)

In [9]:
plays_with_scores.head()

Unnamed: 0,gsis_id,drive_id,home_team,away_team,pos_team,play_id,rushing_att,passing_att,kicking_fga,kicking_xpa,...,fumbles_rec_tds,kicking_fgm,kicking_xpmade,rushing_twoptm,receiving_twoptm,receiving_tds,passing_twoptm,play_type,home_score,away_score
6790,2009080950,1,TEN,BUF,TEN,37,0,0,0,0,...,0,0,0,0,0,0,0,unknown,0,0
6791,2009080950,1,TEN,BUF,TEN,59,0,1,0,0,...,0,0,0,0,0,0,0,pass,0,0
6792,2009080950,1,TEN,BUF,TEN,83,1,0,0,0,...,0,0,0,0,0,0,0,rush,0,0
6793,2009080950,1,TEN,BUF,TEN,104,0,1,0,0,...,0,0,0,0,0,0,0,pass,0,0
6794,2009080950,1,TEN,BUF,TEN,128,0,1,0,0,...,0,0,0,0,0,0,0,pass,0,0


In [10]:
def select_accurately_scored_games():
    validation = pd.merge(game.loc[:, ['gsis_id', 'season_type', 'home_score', 'away_score']], plays_with_scores, left_on=['gsis_id'], right_on=['gsis_id'], suffixes=['_game', '_play'])
    validation = validation.sort_values(['gsis_id', 'drive_id', 'play_id'])
    vg = validation.groupby(['gsis_id'])
    gsis_id = vg['gsis_id'].first()
    home_score_game = vg['home_score_game'].first()
    home_score_play = vg['home_score_play'].last()
    away_score_game = vg['away_score_game'].first()
    away_score_play = vg['away_score_play'].last()

    compare_df = pd.concat([gsis_id, home_score_game, home_score_play, away_score_game, away_score_play], axis=1)
    same = compare_df.loc[(compare_df.home_score_game == compare_df.home_score_play) & (compare_df.away_score_game == compare_df.away_score_play)]
    different = compare_df.loc[~(compare_df.home_score_game == compare_df.home_score_play) | ~(compare_df.away_score_game == compare_df.away_score_play)]
    
    return same, different

In [11]:
accurate_games, incorrect_games = select_accurately_scored_games()
{
    'accurate_games_size': accurate_games.size,
    'incorrect_games_size': incorrect_games.size
}

{'accurate_games_size': 7355, 'incorrect_games_size': 4550}

In [12]:
accurate_games.head()

Unnamed: 0_level_0,gsis_id,home_score_game,home_score_play,away_score_game,away_score_play
gsis_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009081350,2009081350,20,20,10,10
2009081352,2009081352,23,23,0,0
2009081353,2009081353,31,31,10,10
2009081450,2009081450,20,20,23,23
2009081451,2009081451,3,3,13,13


In [13]:
# Save incorrect games to differences.csv
pd.merge(incorrect_games.loc[:, ['gsis_id', 'home_score_game', 'away_score_game']], plays_with_scores, on='gsis_id').to_csv('data/differences.csv')

In [14]:
def load_play():
    df = load_data('play.csv')
    df.time = df.time.apply(remove_parens).apply(lambda x: x.split(','))
    df[['quarter', 'clock']] = df.time.apply(pd.Series)
    
    df = df.loc[df.quarter.isin(['Q1', 'Q2', 'Q3', 'Q4'])]
    df.quarter = df.quarter.apply(lambda x: x[1:])
    df.quarter = df.quarter.astype(int)
    
    df = df.loc[df.yardline.notna()]
    df.yardline = df.yardline.astype(str).apply(remove_parens)
    df.yardline = df.yardline.astype(int)
    
    df = df.loc[df.down.notna()]
    df.down = df.down.astype(int)
    
    df = df.loc[:, ['gsis_id', 'drive_id', 'play_id', 'pos_team', 'quarter', 'clock', 'down', 'yards_to_go', 'yardline']]
    
    return df

play = load_play()
play.head()

Unnamed: 0,gsis_id,drive_id,play_id,pos_team,quarter,clock,down,yards_to_go,yardline
1,2009081350,1,59,PIT,1,0,1,10,-30
2,2009081350,1,80,PIT,1,39,2,11,-31
3,2009081350,1,104,PIT,1,82,3,5,-25
5,2009081350,1,144,PIT,1,96,3,1,-21
6,2009081350,1,169,PIT,1,139,1,10,-18


In [22]:
# Merge accurate games with play data
games_to_analyze_columns_to_keep = ['gsis_id', 'drive_id', 'play_id', 'home_team', 'away_team', 'pos_team', 'home_score', 'away_score', 'play_type']
games_to_analyze = pd.merge(accurate_games.loc[:, ['gsis_id']], plays_with_scores, on=['gsis_id'], how='inner')
games_to_analyze = pd.merge(games_to_analyze.loc[:, games_to_analyze_columns_to_keep], play, on=['gsis_id', 'drive_id', 'play_id'])
games_to_analyze['pos_team'] = games_to_analyze.pos_team_y
games_to_analyze = games_to_analyze.drop(['pos_team_x', 'pos_team_y'], axis=1)
games_to_analyze = games_to_analyze.loc[games_to_analyze.play_type != 'unknown']
games_to_analyze.head()

Unnamed: 0,gsis_id,drive_id,play_id,home_team,away_team,home_score,away_score,play_type,quarter,clock,down,yards_to_go,yardline,pos_team
0,2009081350,1,59,PIT,ARI,0,0,rush,1,0,1,10,-30,PIT
1,2009081350,1,80,PIT,ARI,0,0,pass,1,39,2,11,-31,PIT
3,2009081350,1,144,PIT,ARI,0,0,rush,1,96,3,1,-21,PIT
4,2009081350,1,169,PIT,ARI,0,0,pass,1,139,1,10,-18,PIT
5,2009081350,1,193,PIT,ARI,0,0,rush,1,181,2,7,-15,PIT


In [23]:
games_to_analyze.to_csv(os.path.join('.', 'data', 'play_prediction.csv'), index=False)

In [18]:
ne_games = games_to_analyze.loc[(games_to_analyze.home_team == 'NE') | (games_to_analyze.away_team == 'NE')]

def point_differential(row):
    ne_is_home = row['home_team'] == 'NE'
    if ne_is_home:
        return row.home_score - row.away_score
    else:
        return row.away_score - row.home_score

ne_games['point_differential'] = ne_games.apply(point_differential, axis=1)
ne_games.to_csv(os.path.join('.', 'data', 'ne_play_predictions.csv'), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [None]:
def two_component_pca(features):
    pca = PCA(n_components=2)
    fitted = pca.fit(features)
    principal_components = fitted.transform(features)
    print(features.shape)
    print(principal_components.shape)
    explained_variance_ratio = fitted.explained_variance_ratio_
    return principal_components, explained_variance_ratio

In [None]:
def df_pca(df, feature_names, label_name, title=None):
    if title is None:
        title = ', '.join(feature_names)
    
    df = df.copy()
    x = StandardScaler().fit_transform(df.loc[:, feature_names])
    y = df.loc[:, label_name]
    
    principal, explained_variance_ratio = two_component_pca(x)
    
    fig = plt.figure(figsize = (8,8))
    #ax = fig.add_subplot(1,1,1) 
    #ax.set_xlabel('P1 (%2f)' % explained_variance_ratio[0], fontsize = 15)
    #ax.set_ylabel('P2 (%2f)' % explained_variance_ratio[1], fontsize = 15)
    #ax.set_title(title, fontsize = 20)

    #ax.scatter(principal[:, 0], principal[:, 1], c = y, cmap=plt.cm.get_cmap('spectral', 2))
    #ax.scatter(principal[y == 1, 0], principal[y == 1, 1], c = 'b', s = 10)
    plt.xlabel('P1 (%2f)' % explained_variance_ratio[0], fontsize = 15)
    plt.ylabel('P2 (%2f)' % explained_variance_ratio[1], fontsize = 15)
    plt.scatter(principal[:, 0], principal[:, 1], c = y, s=10, cmap=plt.cm.get_cmap('coolwarm', 2))
    plt.colorbar()
    plt.grid()

In [None]:
two_component_pca(df.loc[:, ['home_score', 'away_score', 'quarter', 'clock', 'down', 'yards_to_go', 'yardline']])

In [None]:
def save_agg_play_instance(game_id):
    agg_play = load_agg_play()
    agg_play.loc[agg_play.gsis_id == game_id].to_csv('data/game_inst.csv')
    play_inst = load_data('play.csv')
    play_inst = play_inst.loc[play_inst.gsis_id == game_id]
    play_inst = play_inst.sort_values(['drive_id', 'play_id'])
    play_inst.reset_index().to_csv('data/play_inst.csv')

save_agg_play_instance(2009082953)

In [None]:
team = load_data('team.csv')
team_ids = team.team_id
team_nums = range(0, len(team_ids))

team_id_to_team_num = {team_id: team_num for (team_num, team_id) in zip(team_nums, team_ids)}


In [None]:
game_drive = game.merge(drive, left_on=['gsis_id', 'home_team'], right_on=['gsis_id', 'pos_team'])

game_drive['home_team_num'] = game_drive.home_team.apply(lambda x: team_id_to_team_num[x])
game_drive['away_team_num'] = game_drive.away_team.apply(lambda x: team_id_to_team_num[x])

grouped_game_drive = game_drive.groupby(['gsis_id', 'home_team'])
week = grouped_game_drive.week.first()
season_year = grouped_game_drive.season_year.first()
mean_start_field = grouped_game_drive.start_field.mean()
penalty_yards = grouped_game_drive.penalty_yards.sum()
turnover_differential = grouped_game_drive.turnover_differential.first()
yards_gained = grouped_game_drive.yards_gained.sum()
home_win = grouped_game_drive.home_win.first()
home_team = grouped_game_drive.home_team_num.first()
away_team = grouped_game_drive.away_team_num.first()
day_of_week = grouped_game_drive.day_of_week.first()

numeric_days = {
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
    'Sunday': 6
}

agg_game_drive = pd.concat([home_team, away_team, season_year, week, day_of_week, mean_start_field, penalty_yards, yards_gained, turnover_differential, home_win], axis=1)
agg_game_drive.home_win = agg_game_drive.home_win.astype(int)
agg_game_drive.day_of_week = agg_game_drive.day_of_week.apply(lambda x: numeric_days[x])
agg_game_drive.head()

# PCA on all features

With all features plotted we see that the divergence of win/loss is not well-explained by this dataset, as expected. There are many
features that have nothing to do with a win such as day of week, week of season, year of season.

The P1 and P2 explanation ratios are quite low, both being less than 20%. This indicates that much of the data is irrelevant, as expected.

In [None]:
features = [name for name in agg_game_drive if name != 'home_win']
df_pca(agg_game_drive, features, 'home_win', title='Field Position, Penalty Yards, and Turnovers')

# Removing the obvious

After removing season year, week, and day of week we see that the data much better explains a win or loss. The P1 and P2 explanation ratios are much higher with just these three features explaining 74% of the variance in data.

In [None]:
df_pca(agg_game_drive, ['start_field', 'penalty_yards', 'turnover_differential'], 'home_win', title='Field Position and Turnovers')

# The Case of San Diego

Over the last decade, the chargers have been a medicore team in terms of win/loss record. In this case, the PCA shows that the three main features we have identified show a greater separation. This is because with the teams at the top and at the bottom, they win or lose despite the stats. A team like the New England Patriots will find a way to win a lost game. A team like San Diego is not of the same quality. If the stats are against them, then they lose. If the stats are with them, then they win. 

In [None]:
df_pca(agg_game_drive.query("home_team == 'SD'"), ['start_field', 'penalty_yards', 'turnover_differential'], 'home_win', title='New England Field Position and Turnovers')

# Predicting the Future

Projecting future outcomes based on a sequence of data is a commonly desirable task.

In [None]:
def load_play():
    play = load_data('play.csv', index_col=['gsis_id', 'drive_id', 'play_id'])
    #play = play.loc[play.pos_team == 'NE']
    #play = play.loc[play.yardline.notnull() & play.down.notna()]
    
    # Half and Final are recorded with Unknown team.
    # Remove these "plays".
    play = play.loc[play.pos_team != 'UNK']
    
    # Remove plays that don't have a down
    play = play.loc[play.down.notnull()]
    
    # Time is given as a tuple. Split it into quarter and clock.
    play.time = play.time.apply(remove_parens).apply(lambda x: x.split(','))
    play[['quarter', 'clock']] = play.time.apply(pd.Series)
    
    # Remove overtime plays.
    # Overtime is a special situation. We do not want to predict on overtime.
    play = play.loc[~play.quarter.isin(['OT', 'OT1', 'OT2', 'OT3', 'OT4', 'OT5', 'OT6'])]
    
    play.quarter = play.quarter.apply(lambda x: 5 if x == 'OT' else x[1:]).astype(int)
    play.down = play.down.astype(int)
    play.yardline = play.yardline.astype(str).apply(remove_parens).astype(int)
    
    return play

play = load_play()
play.head()


In [None]:
drive.head()    

In [None]:
play_agg_play = play.join(agg_play, how='inner', rsuffix='_ap')
play_agg_play = play_agg_play.loc[play_agg_play.kicking_xpa == 0] # Remove extra point attempts
play_agg_play = play_agg_play.loc[:, ['drive_id', 'quarter', 'clock', 'down', 'yardline', 'yards_to_go', 'rushing_att', 'passing_att', 'kicking_fga', 'punting_tot']]
play_agg_play = play_agg_play.loc[(play_agg_play.rushing_att == 1) | (play_agg_play.passing_att == 1) |  (play_agg_play.kicking_fga == 1) | (play_agg_play.punting_tot == 1)]
play_agg_play['play_type'] = play_agg_play.apply(get_play_type, axis=1)
play_agg_play = play_agg_play.drop(['rushing_att', 'passing_att', 'kicking_fga', 'punting_tot'], axis=1)
play_agg_play.head()

In [None]:
play = load_data('play.csv')
drive = load_data('drive.csv')
pd.merge(play.reset_index(), drive, on=['gsis_id', 'drive_id']).set_index(['gsis_id', 'drive_id', 'play_id'])

In [None]:
pd.merge(play_agg_play.reset_index(), drive.loc[:, ['gsis_id', 'drive_id', 'pos_team', 'result']], on=['gsis_id'], suffixes=('_play', '_drive')).sort_values(by=['gsis_id', 'drive_id_drive', 'quarter', 'clock'])

In [None]:
df = pd.read_csv(os.path.join('.', 'data', 'top_qbs_passing_tds.csv'))
df['heightweight'] = df.height / df.weight
df.head()

In [None]:
from sklearn import datasets
import pylab as pl
iris = datasets.load_iris()

X = iris.data
y = iris.target
target_names = iris.target_names

pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)

# Percentage of variance explained for each components
print('explained variance ratio (first two components):', pca.explained_variance_ratio_)

pl.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    pl.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
pl.legend()
pl.title('PCA of IRIS dataset')

pl.show()