In [44]:
import pandas as pd
import numpy as np

In [46]:
df = pd.read_csv(r"D:\ETL_Football\data\fbref_fact_player_season_stats.csv", header=[0,1,2])

# in để kiểm tra chính xác các tuple column names nếu cần
print(df.columns.tolist())

# Chọn cột cần thiết bằng cách match 2 level đầu (level 3 là giá trị dữ liệu)
player_col = [col for col in df.columns if col[0] == 'player' and col[1] == 'Unnamed: 3_level_1'][0]
pos_col = [col for col in df.columns if col[0] == 'pos' and col[1] == 'Unnamed: 5_level_1'][0]
nation_col = [col for col in df.columns if col[0] == 'nation' and col[1] == 'Unnamed: 4_level_1'][0]
born_col = [col for col in df.columns if col[0] == 'born' and col[1] == 'Unnamed: 7_level_1'][0]

df_subset = df[[player_col, pos_col, nation_col, born_col]].copy()

# đặt tên cột dễ dùng
df_subset.columns = ['player', 'pos', 'nation', 'born']

# loại trùng theo tên player, giữ bản ghi đầu tiên
df_subset = df_subset.drop_duplicates(subset='player', keep='first')

# thiết lập lại index để player_id liên tiếp
df_subset = df_subset.reset_index(drop=True)

# tạo player_id tăng dần từ 1
df_subset['player_id'] = np.arange(len(df_subset)) + 1

df_subset['born'] = df_subset['born'].astype('Int64')

# nếu muốn player_id là cột đầu
df_subset = df_subset[['player_id', 'player', 'pos', 'nation', 'born']]

# lưu ra file
df_subset.to_csv(r"D:\ETL_Football\data_processed\dim_player.csv", index=False)

[('league', 'Unnamed: 0_level_1', 'ENG-Premier League'), ('season', 'Unnamed: 1_level_1', '2021'), ('team', 'Unnamed: 2_level_1', 'Arsenal'), ('player', 'Unnamed: 3_level_1', 'Ainsley Maitland-Niles'), ('nation', 'Unnamed: 4_level_1', 'ENG'), ('pos', 'Unnamed: 5_level_1', 'MF,DF'), ('age', 'Unnamed: 6_level_1', '22'), ('born', 'Unnamed: 7_level_1', '1997'), ('Playing Time', 'MP', '11'), ('Playing Time', 'Starts', '5'), ('Playing Time', 'Min', '490'), ('Playing Time', '90s', '5.4'), ('Performance', 'Gls', '0'), ('Performance', 'Ast', '0'), ('Performance', 'G+A', '0'), ('Performance', 'G-PK', '0'), ('Performance', 'PK', '0'), ('Performance', 'PKatt', '0'), ('Performance', 'CrdY', '0'), ('Performance', 'CrdR', '0'), ('Expected', 'xG', '0.1'), ('Expected', 'npxG', '0.1'), ('Expected', 'xAG', '0.1'), ('Expected', 'npxG+xAG', '0.2'), ('Progression', 'PrgC', '12'), ('Progression', 'PrgP', '24'), ('Progression', 'PrgR', '21'), ('Per 90 Minutes', 'Gls', '0'), ('Per 90 Minutes', 'Ast', '0'), ('P

In [47]:
# dữ liệu dim_team
df = pd.read_csv(r"D:\ETL_Football\data\dim_team.csv")
print(df.columns.tolist())

# Loại bỏ dòng header trùng lặp (nếu có)
header_row = list(df.columns)
df = df[~df.apply(lambda row: list(row.values) == header_row, axis=1)].reset_index(drop=True)

df_subset = df[['club_id', 'club_label', 'founding_year', 'venue_id']].copy()
df_subset.columns = ['team_id', 'team_name', 'founded_year', 'stadium_id']

name_map = {
    "AFC Bournemouth": "BOU",
    "Arsenal F.C.": "ARS",
    "Aston Villa F.C.": "AVL",
    "Brentford F.C.": "BRE",
    "Brighton & Hove Albion F.C.": "BHA",
    "Chelsea F.C.": "CHE",
    "Crystal Palace F.C.": "CRY",
    "Everton F.C.": "EVE",
    "Fulham F.C.": "FUL",
    "Ipswich Town F.C.": "IPS",
    "Leicester City F.C.": "LEI",
    "Liverpool F.C.": "LIV",
    "Manchester City F.C.": "MCI",
    "Manchester United F.C.": "MUN",
    "Newcastle United F.C.": "NEW",
    "Nottingham Forest F.C.": "NOT",
    "Southampton F.C.": "SOU",
    "Tottenham Hotspur F.C.": "TOT",
    "West Ham United F.C.": "WHU",
    "Wolverhampton Wanderers F.C.": "WOL",
    "Blackburn Rovers F.C.": "BLA",
    "Bristol City F.C.": "BRC",
    "Burnley F.C.": "BUR",
    "Cardiff City F.C.": "CAR",
    "Coventry City F.C.": "COV",
    "Derby County F.C.": "DER",
    "Hull City A.F.C.": "HUL",
    "Leeds United F.C.": "LEE",
    "Luton Town F.C.": "LUT",
    "Middlesbrough F.C.": "MID",
    "Millwall F.C.": "MIL",
    "Norwich City F.C.": "NOR",
    "Oxford United F.C.": "OXF",
    "Plymouth Argyle F.C.": "PLY",
    "Portsmouth F.C.": "POR",
    "Preston North End F.C.": "PNE",
    "Queens Park Rangers F.C.": "QPR",
    "Sheffield United F.C.": "SHU",
    "Sheffield Wednesday F.C.": "SHW",
    "Stoke City F.C.": "STK",
    "Sunderland A.F.C.": "SUN",
    "Swansea City A.F.C.": "SWA",
    "Watford F.C.": "WAT",
    "West Bromwich Albion F.C.": "WBA"
}

df_subset["short_name"] = df_subset["team_name"].replace(name_map)

remove_words = ["F.C.", "F.C", "FC", "AFC", "A.F.C.", "A.F.C"]

def clean_team_name(name):
    for w in remove_words:
        name = name.replace(w, "")
    return name.strip()

df_subset["team_name"] = df_subset["team_name"].apply(clean_team_name)


name_map = {
    "Brighton & Hove Albion": "Brighton",
    "Manchester United": "Manchester utd",
    "Newcastle United": "Newcastle utd",
    "Sheffield United": "Sheffield utd",
    "Tottenham Hotspur": "Tottenham",
    "West Bromwich Albion": "West brom",
    "West Ham United": "West ham",
    "Wolverhampton Wanderers": "Wolves",
    "A Bournemouth": "Bournemouth",
    "Nottingham Forest": "Nott'ham forest"
}
# Thay thế tên đội
df_subset["team_name"] = df_subset["team_name"].replace(name_map)

# Loại bỏ "Q" và chuyển sang integer
df_subset["team_id"] = df_subset["team_id"].astype(str).str.replace("Q", "", regex=False)
df_subset["team_id"] = pd.to_numeric(df_subset["team_id"], errors='coerce').astype('Int64')

df_subset["stadium_id"] = df_subset["stadium_id"].astype(str).str.replace("Q", "", regex=False)
df_subset["stadium_id"] = pd.to_numeric(df_subset["stadium_id"], errors='coerce').astype('Int64')

df_subset.to_csv(r"D:\ETL_football\data_processed\dim_team.csv", index=False)

['competition_label', 'club_label', 'founding_year', 'venue_label', 'city_label', 'capacity', 'club_id', 'venue_id']


In [42]:
df_tmp = pd.read_csv("D:/ETL_Football/data_processed/dim_team.csv")
df_tmp.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   team_id       44 non-null     int64 
 1   team_name     44 non-null     object
 2   founded_year  44 non-null     int64 
 3   stadium_id    44 non-null     int64 
 4   short_name    44 non-null     object
dtypes: int64(3), object(2)
memory usage: 1.8+ KB


In [48]:
# dữ liệu Dim_stadium
df = pd.read_csv(r"D:\ETL_Football\data\dim_team.csv")
print(df.columns.tolist())

# Loại bỏ dòng header trùng lặp (nếu có)
header_row = list(df.columns)
df = df[~df.apply(lambda row: list(row.values) == header_row, axis=1)].reset_index(drop=True)

df_subset=df[['venue_id',
              'venue_label',
              'capacity',]].copy()
df_subset.columns=['stadium_id','statium_name','capacity']

# Loại bỏ dòng có giá trị "capacity" trong cột capacity (nếu còn)
df_subset = df_subset[df_subset['capacity'].astype(str).str.lower() != 'capacity'].reset_index(drop=True)

# Loại bỏ "Q" và chuyển stadium_id sang integer
df_subset['stadium_id'] = df_subset['stadium_id'].astype(str).str.replace('Q', '', regex=False)
df_subset['stadium_id'] = pd.to_numeric(df_subset['stadium_id'], errors='coerce').astype('Int64')

# Chuyển capacity sang integer
df_subset['capacity'] = pd.to_numeric(df_subset['capacity'], errors='coerce')
df_subset = df_subset.dropna(subset=['capacity'])
df_subset['capacity'] = df_subset['capacity'].astype(int)

df_subset.to_csv(r"D:\ETL_football\data_processed\dim_stadium.csv",index=False)

['competition_label', 'club_label', 'founding_year', 'venue_label', 'city_label', 'capacity', 'club_id', 'venue_id']


In [49]:
df_tmp = pd.read_csv("D:/ETL_Football/data_processed/dim_stadium.csv")
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   stadium_id    44 non-null     int64 
 1   statium_name  44 non-null     object
 2   capacity      44 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.2+ KB


In [None]:

df = pd.read_csv(r"D:\ETL_Football\data\fbref_fact_team_match.csv")
df_team = pd.read_csv(r'D:\ETL_football\data_processed\dim_team.csv')       # team_id, team_name
df_match = pd.read_csv(r'D:\ETL_football\data_processed\dim_match.csv')     # game_id, game
df_player = pd.read_csv(r'D:\ETL_football\data_processed\dim_player.csv')   # player_id, player

#CHUẨN HÓA CHUỖI (rất quan trọng)

df['team'] = df['team'].astype(str).str.strip().str.lower()
df_team['team_name'] = df_team['team_name'].astype(str).str.strip().str.lower()

df['game'] = df['game'].astype(str).str.strip().str.lower()
df_match['game'] = df_match['game'].astype(str).str.strip().str.lower()

df['Captain'] = df['Captain'].astype(str).str.strip().str.lower()
df_player['player'] = df_player['player'].astype(str).str.strip().str.lower()

df['opponent'] = df['opponent'].astype(str).str.strip().str.lower()   # thêm chuẩn hóa opponent

#MAP TEAM → team_id

df = df.merge(
    df_team[['team_id', 'team_name']].rename(columns={'team_name': 'team'}),
    on='team',
    how='left'
)

df.rename(columns={'team_id': 'team_id'}, inplace=True)   # giữ nguyên, cho rõ ràng

#MAP OPPONENT → opponent_id

df = df.merge(
    df_team[['team_id', 'team_name']].rename(columns={'team_name': 'opponent'}),
    on='opponent',
    how='left',
    suffixes=('', '_opp')
)

df.rename(columns={'team_id_opp': 'opponent_id'}, inplace=True)

#MAP GAME → game_id

df = df.merge(
    df_match[['game_id', 'game']],
    on='game',
    how='left'
)

#MAP CAPTAIN → captain_id

df = df.merge(
    df_player[['player_id', 'player']],
    left_on='Captain',
    right_on='player',
    how='left'
)

df.rename(columns={'player_id': 'captain_id'}, inplace=True)
df.drop(columns=['player'], inplace=True)


# Loại bỏ "Q" và chuyển team_id và opponent_id sang integer
if 'team_id' in df.columns:
    df['team_id'] = df['team_id'].astype(str).str.replace('Q', '', regex=False)
    df['team_id'] = pd.to_numeric(df['team_id'], errors='coerce').astype('Int64')
if 'opponent_id' in df.columns:
    df['opponent_id'] = df['opponent_id'].astype(str).str.replace('Q', '', regex=False)
    df['opponent_id'] = pd.to_numeric(df['opponent_id'], errors='coerce').astype('Int64')

#CHUẨN HÓA CỘT round

df["round"] = df["round"].apply(lambda x: x.split()[-1].zfill(2))

#TẠO SUBSET CỘT FACT CUỐI

df_subset = df[
    [
        'season',
        'game_id',
        'team_id',
        'opponent_id',
        'round',
        'venue',
        'result',
        'GF',
        'GA',
        'xG',
        'xGA',
        'Poss',
        'captain_id',
        'Formation',
        'Opp Formation',
    ]
]

print(df_subset.head())


df_subset = df[
    [
        'season',
        'game_id',
        'team_id',
        'opponent_id',
        'round',
        'venue',
        'result',
        'GF',
        'GA',
        'xG',
        'xGA',
        'Poss',
        'captain_id',
        'Formation',
        'Opp Formation',
    ]
]
print(df_subset.head())

df_subset.to_csv(r'D:\ETL_Football\data_processed\fact_team_match_clean.csv', index=False)


Team không map: []
Opponent không map: []
Game không map: []
Captain không map: []
   season  game_id  team_id  opponent_id round venue result  GF  GA   xG  xGA  \
0    2021        1     9617        18708    01  Away      W   3   0  1.9  0.1   
1    2021        9     9617        18747    02  Home      W   2   1  1.1  2.0   
2    2021       28     9617      1130849    03  Away      L   1   3  1.3  2.7   
3    2021       37     9617        19607    04  Home      W   2   1  0.6  0.2   
4    2021       41     9617        50602    05  Away      L   0   1  0.9  1.3   

   Poss  captain_id Formation Opp Formation  
0    54          21     3-4-3       4-2-3-1  
1    62          21     3-4-3         5-4-1  
2    34          21     3-4-3         4-3-3  
3    64          21     4-3-3         3-5-2  
4    42          21     4-3-3       3-1-4-2  
   season  game_id  team_id  opponent_id round venue result  GF  GA   xG  xGA  \
0    2021        1     9617        18708    01  Away      W   3   0  1.9 

In [35]:
df_tmp = pd.read_csv("D:/ETL_Football/data_processed/fact_team_match_clean.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3800 entries, 0 to 3799
Data columns (total 27 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   league         3800 non-null   object 
 1   season         3800 non-null   int64  
 2   team           3800 non-null   object 
 3   game           3800 non-null   object 
 4   date           3800 non-null   object 
 5   time           3800 non-null   object 
 6   round          3800 non-null   object 
 7   day            3800 non-null   object 
 8   venue          3800 non-null   object 
 9   result         3800 non-null   object 
 10  GF             3800 non-null   int64  
 11  GA             3800 non-null   int64  
 12  opponent       3800 non-null   object 
 13  xG             3800 non-null   float64
 14  xGA            3800 non-null   float64
 15  Poss           3800 non-null   int64  
 16  Attendance     3102 non-null   float64
 17  Captain        3800 non-null   object 
 18  Formatio

In [52]:
# dữ liệu fact_playermatchstats
df=pd.read_csv(r"d:\ETL_Football\data\fbref_fact_player_match_stats.csv",header=[0,1,2])
print(df.columns.tolist())
df_match = pd.read_csv(r'D:\ETL_football\data_processed\dim_match.csv')     # game_id, game
df_player = pd.read_csv(r'D:\ETL_football\data_processed\dim_player.csv')   # player_id, player
df_team=pd.read_csv(r'D:\ETL_Football\data_processed\dim_team.csv')

# Xác định tên cột dựa trên cấu trúc thực tế (match 2 level đầu)
season_col_name = [col for col in df.columns if col[0] == 'season' and col[1] == 'Unnamed: 1_level_1'][0]
game_col_name = [col for col in df.columns if col[0] == 'game' and col[1] == 'Unnamed: 2_level_1'][0]
team_col_name = [col for col in df.columns if col[0] == 'team' and col[1] == 'Unnamed: 3_level_1'][0]
player_col_name = [col for col in df.columns if col[0] == 'player' and col[1] == 'Unnamed: 4_level_1'][0]

# Loại bỏ dòng đầu tiên nếu có giá trị "season" (dòng header thật trong file CSV)
if len(df) > 0 and str(df.iloc[0][season_col_name]).lower() == 'season':
    df = df.iloc[1:].reset_index(drop=True)
    print(f"Đã loại bỏ dòng header trùng lặp. Số dòng còn lại: {len(df)}")

# Tìm các cột khác bằng cách match 2 level đầu
min_col = [col for col in df.columns if col[0] == 'min' and col[1] == 'Unnamed: 9_level_1'][0]
gls_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'Gls'][0]
ast_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'Ast'][0]
pk_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'PK'][0]
pkatt_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'PKatt'][0]
sh_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'Sh'][0]
sot_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'SoT'][0]
crdy_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'CrdY'][0]
crdr_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'CrdR'][0]
touches_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'Touches'][0]
tkl_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'Tkl'][0]
int_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'Int'][0]
blocks_col = [col for col in df.columns if col[0] == 'Performance' and col[1] == 'Blocks'][0]
sca_col = [col for col in df.columns if col[0] == 'SCA' and col[1] == 'SCA'][0]
gca_col = [col for col in df.columns if col[0] == 'SCA' and col[1] == 'GCA'][0]
cmp_col = [col for col in df.columns if col[0] == 'Passes' and col[1] == 'Cmp'][0]
att_col = [col for col in df.columns if col[0] == 'Passes' and col[1] == 'Att'][0]
cmppct_col = [col for col in df.columns if col[0] == 'Passes' and col[1] == 'Cmp%'][0]
prgp_col = [col for col in df.columns if col[0] == 'Passes' and col[1] == 'PrgP'][0]
carries_col = [col for col in df.columns if col[0] == 'Carries' and col[1] == 'Carries'][0]
prgc_col = [col for col in df.columns if col[0] == 'Carries' and col[1] == 'PrgC'][0]
to_att_col = [col for col in df.columns if col[0] == 'Take-Ons' and col[1] == 'Att'][0]
to_succ_col = [col for col in df.columns if col[0] == 'Take-Ons' and col[1] == 'Succ'][0]

df_subset=df[[season_col_name,  # season
              game_col_name,  # game
              team_col_name,  # team
              player_col_name,  # player
              min_col,
              gls_col,
              ast_col,
              pk_col,
              pkatt_col,
              sh_col,
              sot_col,
              crdy_col,
              crdr_col,
              touches_col,
              tkl_col,
              int_col,
              blocks_col,
              sca_col,
              gca_col,
              cmp_col,
              att_col,
              cmppct_col,
              prgp_col,
              carries_col,
              prgc_col,
              to_att_col,
              to_succ_col]]


df_subset.columns = [
    'season',
    'game',
    'team',
    'player',
    'min_played',
    'goals',
    'assists',
    'penalty_made',
    'penalty_attempted',
    'shots',
    'shots_on_target',
    'yellow_cards',
    'red_cards',
    'touches',
    'tackles',
    'interceptions',
    'blocks',
    'shot_creating_actions',
    'goal_creating_actions',
    'passes_completed',
    'passes_attempted',
    'pass_completion_percent',
    'progressive_passes',
    'carries',
    'progressive_carries',
    'take_ons_attempted',
    'take_ons_successful'
]


# tên khác form
name_map = {
    "Brighton & Hove Albion": "Brighton",
    "Manchester United": "Manchester utd",
    "Newcastle United": "Newcastle utd",
    "Sheffield United": "Sheffield utd",
    "Tottenham Hotspur": "Tottenham",
    "West Bromwich Albion": "West brom",
    "West Ham United": "West ham",
    "Wolverhampton Wanderers": "Wolves",
    "Nottingham Forest": "Nott'ham forest"
}
# Thay thế tên đội
df_subset["team"] = df_subset["team"].replace(name_map)



# Chuẩn hóa chuỗi cho game
df_subset['game'] = df_subset['game'].astype(str).str.strip().str.lower()
df_match['game'] = df_match['game'].astype(str).str.strip().str.lower()

# Map game → game_id
df_subset = df_subset.merge(
    df_match[['game_id', 'game']],
    on='game',
    how='left'
)



# Chuẩn hóa cho team
df_subset['team'] = df_subset['team'].astype(str).str.strip().str.lower()
df_team['team_name'] = df_team['team_name'].astype(str).str.strip().str.lower()

# Map team → team_id
df_subset = df_subset.merge(
    df_team[['team_id', 'team_name']],
    left_on='team',
    right_on='team_name',
    how='left'
)

# Xóa cột thừa
if 'team_name' in df_subset.columns:
    df_subset.drop(columns=['team_name'], inplace=True)

# Đảm bảo team_id là integer (loại bỏ "Q" nếu có)
if 'team_id' in df_subset.columns:
    df_subset['team_id'] = df_subset['team_id'].astype(str).str.replace('Q', '', regex=False)
    df_subset['team_id'] = pd.to_numeric(df_subset['team_id'], errors='coerce').astype('Int64')

# Chuẩn hóa cho player
df_subset['player'] = df_subset['player'].astype(str).str.strip().str.lower()
df_player['player'] = df_player['player'].astype(str).str.strip().str.lower()

# Map player → player_id
df_subset = df_subset.merge(
    df_player[['player_id', 'player']],
    on='player',
    how='left'
)

print("Game không map:", df_subset[df_subset['game_id'].isna()]['game'].unique())
print("Team không map:", df_subset[df_subset['team_id'].isna()]['team'].unique())
print("Player không map:", df_subset[df_subset['player_id'].isna()]['player'].unique())

df_subset=df_subset[['season',
    'game_id',
    'team_id',
    'player_id',
    'min_played',
    'goals',
    'assists',
    'penalty_made',
    'penalty_attempted',
    'shots',
    'shots_on_target',
    'yellow_cards',
    'red_cards',
    'touches',
    'tackles',
    'interceptions',
    'blocks',
    'shot_creating_actions',
    'goal_creating_actions',
    'passes_completed',
    'passes_attempted',
    'pass_completion_percent',
    'progressive_passes',
    'carries',
    'progressive_carries',
    'take_ons_attempted',
    'take_ons_successful']]


df_subset.to_csv(r'D:\ETL_Football\data_processed\fact_player_match_clean.csv', index=False)

[('league', 'Unnamed: 0_level_1', 'ENG-Premier League'), ('season', 'Unnamed: 1_level_1', '2021'), ('game', 'Unnamed: 2_level_1', '2020-09-12 Crystal Palace-Southampton'), ('team', 'Unnamed: 3_level_1', 'Crystal Palace'), ('player', 'Unnamed: 4_level_1', 'Andros Townsend'), ('jersey_number', 'Unnamed: 5_level_1', '10'), ('nation', 'Unnamed: 6_level_1', 'ENG'), ('pos', 'Unnamed: 7_level_1', 'RM'), ('age', 'Unnamed: 8_level_1', '29-058'), ('min', 'Unnamed: 9_level_1', '90'), ('Performance', 'Gls', '0'), ('Performance', 'Ast', '1'), ('Performance', 'PK', '0'), ('Performance', 'PKatt', '0'), ('Performance', 'Sh', '0'), ('Performance', 'SoT', '0'), ('Performance', 'CrdY', '0'), ('Performance', 'CrdR', '0'), ('Performance', 'Touches', '38'), ('Performance', 'Tkl', '2'), ('Performance', 'Int', '1'), ('Performance', 'Blocks', '0'), ('Performance', 'xG', '0'), ('Expected', 'npxG', '0'), ('Expected', 'xAG', '0.6'), ('SCA', 'SCA', '2'), ('SCA', 'GCA', '1'), ('Passes', 'Cmp', '13'), ('Passes', 'At

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["team"] = df_subset["team"].replace(name_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['game'] = df_subset['game'].astype(str).str.strip().str.lower()


In [53]:
df_tmp = pd.read_csv("D:/ETL_Football/data_processed/fact_player_match_clean.csv")
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55173 entries, 0 to 55172
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   season                   55173 non-null  int64  
 1   game_id                  55173 non-null  int64  
 2   team_id                  55173 non-null  int64  
 3   player_id                55173 non-null  int64  
 4   min_played               55173 non-null  int64  
 5   goals                    55173 non-null  int64  
 6   assists                  55173 non-null  int64  
 7   penalty_made             55173 non-null  int64  
 8   penalty_attempted        55173 non-null  int64  
 9   shots                    55173 non-null  int64  
 10  shots_on_target          55173 non-null  int64  
 11  yellow_cards             55173 non-null  int64  
 12  red_cards                55173 non-null  int64  
 13  touches                  55173 non-null  int64  
 14  tackles               

In [None]:

df=pd.read_csv(r'D:\ETL_football\data\premier_league_last_5_seasons.csv')
df_team=pd.read_csv(r'D:\ETL_Football\data_processed\dim_team.csv')


def convert_season(season):
    # season dạng "2024/2025"
    parts = season.split("/")
    y1 = parts[0][-2:]   # lấy 2 số cuối của năm đầu
    y2 = parts[1][-2:]   # lấy 2 số cuối của năm sau
    return y1 + y2       # ghép lại thành 2425

df["Mùa giải"] = df["Mùa giải"].apply(convert_season)
df = df.rename(columns={"Mùa giải": "season_id"})


# tên khác form
name_map = {
    "Ipswich": "Ipswich Town",
    "Luton": "Luton Town",
   "Newcastle": "Newcastle utd",
    "Leeds": "Leeds United",
    "Leicester": "Leicester City",
    "Norwich": "Norwich City",
    "Nottingham": "Nott'ham forest"
}
# Thay thế tên đội
df["Team"] = df["Team"].replace(name_map)


# Chuẩn hóa cho team
df['Team'] = df['Team'].astype(str).str.strip().str.lower()
df_team['team_name'] = df_team['team_name'].astype(str).str.strip().str.lower()



# Map team → team_id
df = df.merge(
    df_team[['team_id', 'team_name']],
    left_on='Team',
    right_on='team_name',
    how='left'
)

# Xóa cột thừa
if 'team_name' in df.columns:
    df.drop(columns=['team_name'], inplace=True)


# đổi kiểu dữ liệu cột rank
df["Rank"] = df["Rank"].astype(int)

# Tách GF và GA từ cột "GF:GA"
df[["GF", "GA"]] = df["GF:GA"].str.split(":", expand=True)

# Chuyển kiểu dữ liệu sang int
df["GF"] = df["GF"].astype(int)
df["GA"] = df["GA"].astype(int)

# Xóa cột cũ
df.drop(columns=["GF:GA"], inplace=True)


df_subset=df[["season_id","Match_Category","Rank","team_id","MP","W","D","L","GF","GA","GD","Pts","Recent_Form"]]

df_subset.to_csv(r'D:\ETL_Football\data_processed\fact_team_point.csv', index=False)

Team không map: []


In [40]:
df_tmp = pd.read_csv("D:/ETL_Football/data_processed/fact_team_point.csv")
df_tmp.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   season_id       300 non-null    int64 
 1   Match_Category  300 non-null    object
 2   Rank            300 non-null    int64 
 3   team_id         300 non-null    int64 
 4   MP              300 non-null    int64 
 5   W               300 non-null    int64 
 6   D               300 non-null    int64 
 7   L               300 non-null    int64 
 8   GF              300 non-null    int64 
 9   GA              300 non-null    int64 
 10  GD              300 non-null    int64 
 11  Pts             300 non-null    int64 
 12  Recent_Form     300 non-null    object
dtypes: int64(11), object(2)
memory usage: 30.6+ KB
