# Introduction to Q4: Gibbs Sampler


In [2]:
# APML - Advanced probabilistic machine learning Project
# Preprocessing
# prepare the dataset
import pandas as pd

def preprocess_serieA_no_draws(filepath: str = 'SerieA_dataset.csv') -> pd.DataFrame:
    """
    Load the Serie A results CSV and reduce it to decisive (non-draw) games.

    The CSV must contain the columns ``score1``, ``score2``, ``HH:MM`` and
    ``yyyy-mm-dd``; any other columns are passed through unchanged. Progress
    information (shapes, previews, summary statistics) is printed to stdout
    at every step.

    Parameters:
    - filepath (str): Path to the dataset.

    Returns:
    - games_df (pd.DataFrame): Rows where ``score1 != score2``, keeping their
      original index labels. The score and date/time columns are removed and
      an integer column ``y`` is added: ``1`` when ``score1 > score2``,
      ``-1`` otherwise.
    """

    # Load dataset Serie A
    games_df = pd.read_csv(filepath)
    print(f'Shape of the games dataset: {games_df.shape}')
    print(f'first 3 rows:\n{games_df.head(3)}')
    print('\nDescription of the games dataset:')
    print(games_df.describe())

    # Remove draw games. One boolean mask splits the frame in both directions;
    # the copy makes the later column assignment act on an independent frame.
    is_draw = games_df['score1'] == games_df['score2']
    draw_games_df = games_df[is_draw]
    games_df = games_df[~is_draw].copy()
    print(f'\nShape of the draw games dataset: {draw_games_df.shape}')
    print(f'first 3 rows of draw games:\n{draw_games_df.head(3)}')
    print(f'\nShape of the games dataset after removing draw games: {games_df.shape}')

    # Create a new column with the result of the game.
    # Vectorized instead of a per-row lambda: True/False -> 1/0 -> 1/-1, which
    # also keeps the column integer-typed when no rows remain.
    first_team_won = games_df['score1'] > games_df['score2']
    games_df['y'] = first_team_won.astype('int64') * 2 - 1
    print(f'first 3 rows of no-draw games:\n{games_df.head(3)}')

    # Remove unnecessary columns (score1, score2, HH:MM, yyyy-mm-dd)
    games_df = games_df.drop(columns=['score1', 'score2', 'HH:MM', 'yyyy-mm-dd'])
    print(f'\nShape of the games dataset after removing unnecessary columns: {games_df.shape}')
    print(f'first 3 rows of no-draw games:\n{games_df.head(3)}')

    return games_df





# Optional entry point: when this file is executed directly (rather than
# imported), run the preprocessing with the default CSV path and keep the
# result in the module-level name `df`.
if __name__ == "__main__":
    df = preprocess_serieA_no_draws()


Shape of the games dataset: (380, 6)
first 3 rows:
   yyyy-mm-dd  HH:MM   team1     team2  score1  score2
0  2018-08-18  18:00  Chievo  Juventus       2       3
1  2018-08-18  20:30   Lazio    Napoli       1       2
2  2018-08-19  18:00  Torino      Roma       0       1

Description of the games dataset:
           score1      score2
count  380.000000  380.000000
mean     1.484211    1.197368
std      1.196857    1.155687
min      0.000000    0.000000
25%      1.000000    0.000000
50%      1.000000    1.000000
75%      2.000000    2.000000
max      6.000000    6.000000

Shape of the draw games dataset: (108, 6)
first 3 rows of draw games:
    yyyy-mm-dd  HH:MM      team1    team2  score1  score2
4   2018-08-19  20:30      Parma  Udinese       2       2
12  2018-08-26  20:30      Inter   Torino       2       2
14  2018-08-26  20:30  Frosinone  Bologna       0       0

Shape of the games dataset after removing draw games: (272, 6)
first 3 rows of no-draw games:
   yyyy-mm-dd  HH:MM   tea