In [None]:
import time
from io import StringIO

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape, for every season in `years`, each team's "Scores & Fixtures" table and
# its "Shooting" table from fbref, join them on "Date", and collect one frame per
# team/season in `all_matches`.  Seasons are walked backwards by following the
# "previous season" link found on each standings page.
years = list(range(2022, 1996, -1))
all_matches = []

# Columns taken from the shooting table and joined onto the fixtures.
shooting_cols = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]

# NOTE(review): this URL has no season in it, while the first label taken from
# `years` is 2022 — confirm the page it serves matches that label before relying
# on the "Season" column.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
for year in years:
    standings_page = requests.get(standings_url)
    standings_page.raise_for_status()  # fail with the HTTP status, not a parse error
    standings_soup = BeautifulSoup(standings_page.text, "html.parser")
    standings_table = standings_soup.select('table.stats_table')[0]

    # Anchors without an href yield None, so guard before the substring test.
    hrefs = [a.get("href") for a in standings_table.find_all("a")]
    team_urls = [f"https://fbref.com{h}" for h in hrefs if h and '/squads/' in h]

    previous_season = standings_soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        # Pause at the top of the iteration so the delay is honoured even when
        # an earlier team was skipped with `continue`.
        time.sleep(10)

        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        team_page = requests.get(team_url)
        team_page.raise_for_status()
        # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
        matches = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]

        team_soup = BeautifulSoup(team_page.text, "html.parser")
        page_hrefs = [a.get("href") for a in team_soup.find_all('a')]
        shooting_links = [h for h in page_hrefs if h and 'all_comps/shooting/' in h]
        if not shooting_links:
            # No shooting page linked for this team/season: nothing to merge.
            continue

        shooting_page = requests.get(f"https://fbref.com{shooting_links[0]}")
        shooting_page.raise_for_status()
        shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()
        try:
            team_data = matches.merge(shooting[shooting_cols], on="Date")
        except (KeyError, ValueError):
            # KeyError: a requested shooting column is absent from the table.
            continue
        # .copy() so the column assignments below write to an independent frame.
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()

        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)

In [None]:
all_matches

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Stack the per-team frames into one table, lowercase every column name,
# and write the result to matches.csv.
match_df = pd.concat(all_matches)
match_df.columns = list(map(str.lower, match_df.columns))
match_df.to_csv("matches.csv")

In [None]:
match_df

In [None]:
matches = pd.read_csv("matches.csv", index_col = 0)

In [None]:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
ls

In [None]:
pwd

In [None]:
cd ..

In [None]:
ls

In [None]:
cd home

In [None]:
ls

In [None]:
# Load the match data, using the first CSV column as the index.
# NOTE(review): this reads "matches (1).csv", whereas the export cell earlier in
# the notebook writes "matches.csv" — confirm this is the intended file.
matches = pd.read_csv("matches (1).csv", index_col = 0)

In [None]:
matches.shape

In [None]:
matches["team"].value_counts()

In [None]:
matches[matches["team"] == "Chelsea"]

In [None]:
matches.dtypes

In [None]:
# Convert the `date` column to pandas datetimes; later cells use the `.dt`
# accessor on it and compare it against date strings.
matches["date"] = pd.to_datetime(matches["date"])

In [None]:
# Integer code for each distinct value of `venue` (via a categorical dtype).
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [None]:
# Integer code for each distinct value of `opponent` (via a categorical dtype).
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [None]:
# Hour part of `time`: drop everything from the first ":" onward, then cast to int.
matches["hour"] = matches["time"].str.replace(":.+", "", regex =True).astype("int")

In [None]:
# Day of the week as an integer (Monday=0 through Sunday=6).
matches["day_code"] = matches["date"].dt.dayofweek

In [None]:
# Binary label: 1 when `result` is "W", 0 for any other value.
matches["target"] = (matches["result"] == "W").astype("int")

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Forest of 50 trees; a node must hold at least 10 samples before it may be
# split; random_state is fixed so repeated fits build the same forest.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [None]:
# Training split: rows dated strictly before 2022-01-01.
train = matches[matches["date"] < '2022-01-01']

In [None]:
# Evaluation split: rows dated 2022-01-01 or later.  ">=" is used so that matches
# dated exactly 2022-01-01, which the strict "<" of the training split excludes,
# are not silently dropped from both splits.
test = matches[matches["date"] >= '2022-01-01']

In [None]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [None]:
rf.fit(train[predictors], train["target"])

In [None]:
preds = rf.predict(test[predictors])

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
acc = accuracy_score(test["target"], preds)

In [None]:
acc

In [None]:
combined = pd.DataFrame(dict(actual = test["target"], prediction=preds))

In [None]:
pd.crosstab(index = combined["actual"], columns = combined["prediction"])

In [None]:
from sklearn.metrics import precision_score

In [None]:
precision_score(test["target"], preds)

0.4904632152588556

In [None]:
grouped_matches = matches.groupby("team")

In [None]:
group = grouped_matches.get_group("Chelsea")

In [None]:
group

In [None]:
def rolling_averages(group, cols, new_cols, window=3):
  """Add trailing-average columns computed from each row's preceding rows.

  Rows are ordered by ``date``; for every row the mean of ``cols`` over the
  ``window`` rows before it is stored under ``new_cols``.  ``closed="left"``
  leaves the current row out of its own average.  Rows whose rolling value is
  missing (for example the first ``window`` rows) are dropped.

  Args:
    group: DataFrame with a ``date`` column and every column named in ``cols``.
    cols: Names of the columns to average.
    new_cols: Names of the output columns, parallel to ``cols``.
    window: Number of preceding rows in each average (default 3).

  Returns:
    A new DataFrame sorted by ``date``; ``group`` itself is not modified.
  """
  ordered = group.sort_values("date")  # sort_values returns a new frame
  ordered[new_cols] = ordered[cols].rolling(window, closed="left").mean()
  return ordered.dropna(subset=new_cols)

In [None]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]

In [None]:
new_cols = [f"{c}_rolling" for c in cols]

In [None]:
new_cols

In [None]:
rolling_averages(group,cols,new_cols)

In [None]:
# Run rolling_averages on each team's rows separately, so an average is only
# ever taken over that team's own matches.  The `team` group key becomes an
# extra index level on the result.
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols,new_cols))

In [None]:
matches_rolling

In [None]:
# droplevel returns a new frame rather than modifying in place, so assign the
# result back; otherwise the `team` index level is never actually removed.
matches_rolling = matches_rolling.droplevel("team")

In [None]:
matches_rolling.index = range(matches_rolling.shape[0])

In [None]:
matches_rolling

In [None]:
def make_predictions(data, predictors, cutoff='2022-01-01'):
  """Fit the notebook-level ``rf`` on rows before ``cutoff`` and score the rest.

  Rows with ``date`` earlier than ``cutoff`` form the training set; rows on or
  after it form the test set, so a match dated exactly on ``cutoff`` is
  evaluated rather than dropped from both sets.

  Args:
    data: DataFrame with ``date`` and ``target`` columns plus every column
      named in ``predictors``.
    predictors: Names of the feature columns passed to the model.
    cutoff: Date separating train from test (default '2022-01-01').

  Returns:
    ``(combined, precision)``: ``combined`` is a DataFrame indexed like the
    test rows with ``actual`` and ``predicted`` columns; ``precision`` is the
    value of ``precision_score`` on the test rows.

  Side effects:
    Refits the module-level ``rf`` estimator.
  """
  train = data[data["date"] < cutoff]
  test = data[data["date"] >= cutoff]
  rf.fit(train[predictors], train["target"])
  preds = rf.predict(test[predictors])
  combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
  precision = precision_score(test["target"], preds)
  return combined, precision


In [None]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [None]:
precision

In [None]:
combined

In [None]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index = True, right_index=True)

In [None]:
combined

In [None]:
class MissingDict(dict):
  """dict whose lookup of an absent key returns that key unchanged."""

  def __missing__(self, key):
    return key


# Full club names and the shorter spellings they are translated to.
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}

mapping = MissingDict(map_values)

In [None]:
mapping["West Ham United"]

In [None]:
combined["new_team"] = combined["team"].map(mapping)

In [None]:
combined

In [None]:
merged = combined.merge(combined, left_on = ["date", "new_team"], right_on = ["date", "opponent"])

In [None]:
merged

In [None]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

1    170
0     99
Name: actual_x, dtype: int64

In [None]:
# Precision for the rows where one side is predicted to win and the other is
# not: 170 correct out of the 170 + 99 such rows counted in the output above.
split_hits, split_misses = 170, 99
split_precision = split_hits / (split_hits + split_misses)
split_precision

0.6319702602230483