## Prepare

In [1]:
# Mount the user's Google Drive (drive/MyDrive) into the Colab working filesystem
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
import requests
from pathlib import Path
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [14]:
# Display settings: show every column of wide frames and print floats with
# two decimal places.
# Use the canonical option keys: "display.max.columns" is not a registered key
# and only worked because pandas falls back to regex matching ("." matches "_").
pd.set_option("display.max_columns", None)
pd.set_option("display.precision", 2)

In [15]:
# Download the NBA Elo CSV and store it on the mounted Google Drive.
downloadurl = "https://raw.githubusercontent.com/fivethirtyeight/data/master/nba-elo/nbaallelo.csv"
# Anchor the path at the mount point passed to drive.mount() instead of
# Path.cwd(), so the cell does not depend on the kernel's working directory.
target_csv_path = Path('/content/drive') / 'MyDrive' / '資料分析案例實作' / 'NBA' / 'nba_all_elo.csv'
# Create the destination folder when it is missing; writing would otherwise
# fail with FileNotFoundError.
target_csv_path.parent.mkdir(parents=True, exist_ok=True)
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(downloadurl, timeout=30)
if response.status_code == requests.codes.ok:
    target_csv_path.write_bytes(response.content)
    print("Download ready.")
else:
    print("Can NOT download.")

Download ready.


## Process

## 了解資料集結構

In [18]:
# Read the downloaded CSV, then let pandas pick nullable dtypes for each column.
nba = pd.read_csv(target_csv_path)
nba = nba.convert_dtypes()
# Preview the first five rows (head() default).
nba.head()

Unnamed: 0,gameorder,game_id,lg_id,_iscopy,year_id,date_game,seasongame,is_playoffs,team_id,fran_id,pts,elo_i,elo_n,win_equiv,opp_id,opp_fran,opp_pts,opp_elo_i,opp_elo_n,game_location,game_result,forecast,notes
0,1,194611010TRH,NBA,0,1947,11/1/1946,1,0,TRH,Huskies,66,1300.0,1293.28,40.29,NYK,Knicks,68,1300.0,1306.72,H,L,0.64,
1,1,194611010TRH,NBA,1,1947,11/1/1946,1,0,NYK,Knicks,68,1300.0,1306.72,41.71,TRH,Huskies,66,1300.0,1293.28,A,W,0.36,
2,2,194611020CHS,NBA,0,1947,11/2/1946,1,0,CHS,Stags,63,1300.0,1309.65,42.01,NYK,Knicks,47,1306.72,1297.07,H,W,0.63,
3,2,194611020CHS,NBA,1,1947,11/2/1946,2,0,NYK,Knicks,47,1306.72,1297.07,40.69,CHS,Stags,63,1300.0,1309.65,A,L,0.37,
4,3,194611020DTF,NBA,0,1947,11/2/1946,1,0,DTF,Falcons,33,1300.0,1279.62,38.86,WSC,Capitols,50,1300.0,1320.38,H,L,0.64,


### 基本統計 Basic statistics

In [19]:
# (rows, columns) of the loaded frame.
nba.shape

(126314, 23)

In [None]:
# Per-column dtype and non-null count overview.
nba.info()

In [None]:
# Descriptive statistics using DataFrame.describe() defaults.
nba.describe()

### Question (team_id vs fran_id)
查看team_id和fran_id欄位。資料集包含104個不同的team_id，但僅包含 53 個不同的fran_id。此外，最頻繁的team_id是BOS，但最頻繁的fran_id是湖人隊。可能嗎？

In [None]:
# Number of rows per distinct team_id, most frequent first.
nba["team_id"].value_counts()

In [None]:
# Number of rows per distinct fran_id, most frequent first.
nba["fran_id"].value_counts()

It seems that a team named "Lakers" played 6024 games, but only 5078 of those were played by the Los Angeles Lakers. Find out who the other "Lakers" team is:

In [None]:
# Pattern: df.loc[row_mask, column_label]
# Which team_id values appear under the "Lakers" franchise, and how often.
nba.loc[nba["fran_id"].eq("Lakers"), "team_id"].value_counts()

The Minneapolis Lakers ("MNL") played 946 games.
You can even find out when they played those games.

In [None]:
# Parse the text dates into timestamps. The CSV stores them as month/day/year
# (e.g. "11/1/1946"), so state the format explicitly instead of relying on
# pandas' format inference for an ambiguous layout.
nba["date_played"] = pd.to_datetime(nba["date_game"], format="%m/%d/%Y")
nba["date_played"]

In [None]:
# Earliest date_played among rows whose team_id is "MNL".
nba.loc[nba["team_id"].eq("MNL"), "date_played"].min()

In [None]:
# Latest date_played among rows whose team_id is "MNL".
nba.loc[nba["team_id"].eq("MNL"), "date_played"].max()

In [None]:
# agg() applies one or more reductions in a single call, e.g. df.agg(["sum", "min"]).
# Here: first and last date_played for team_id "MNL".
nba.loc[nba["team_id"].eq("MNL"), "date_played"].agg(["min", "max"])

### Question (BOS Celtics)
我們還發現了波士頓塞爾提克隊「BOS」在數據集中打了最多的比賽。算一下BOS在此數據集中的所有比賽總共得了幾分


In [None]:
# Total of the pts column over every row whose team_id is "BOS" (Boston Celtics).
nba.loc[nba["team_id"].eq("BOS"), "pts"].sum()

In [None]:
# List the column labels.
nba.columns

In [None]:
# Membership test on the column labels: is there a column named "points"?
"points" in nba.columns

In [None]:
# Membership test on the column labels: is there a column named "pts"?
"pts" in nba.columns

In [None]:
# Second-to-last row, selected by integer position.
nba.iloc[-2]

In [None]:
# Label-based slice (both endpoints included) limited to four columns.
nba.loc[5555:5559, ["fran_id", "opp_fran", "pts", "opp_pts"]]

In [None]:
# First five rows (head() default).
nba.head()

In [None]:
# Keep only rows whose year_id is strictly greater than 2010.
current_decade = nba.loc[nba["year_id"].gt(2010)]

In [None]:
# (rows, columns) of the year_id > 2010 subset.
current_decade.shape

In [None]:
# Rows where the notes column holds a value (i.e. is not missing).
games_with_notes = nba.loc[nba["notes"].notna()]

In [None]:
# (rows, columns) of the subset that has notes.
games_with_notes.shape

In [None]:
# fran_id values that end with the suffix "ers".
nba["fran_id"].loc[nba["fran_id"].str.endswith("ers")]

In [None]:
# All rows whose fran_id ends with "ers".
ers = nba[nba["fran_id"].str.endswith("ers")]

In [None]:
# (rows, columns) of the "...ers" franchise subset.
ers.shape

In [None]:
# Rows with _iscopy == 0 where team "BLB" and its opponent both scored over 100.
nba.loc[
    nba["_iscopy"].eq(0)
    & nba["pts"].gt(100)
    & nba["opp_pts"].gt(100)
    & nba["team_id"].eq("BLB")
]

In [None]:
# Rows with _iscopy == 0 and year_id 1992 where team_id starts with "LA"
# and the notes column is filled in.
nba.loc[
    nba["_iscopy"].eq(0)
    & nba["team_id"].str.startswith("LA")
    & nba["notes"].notna()
    & nba["year_id"].eq(1992)
]

In [None]:
# Pull the pts column out as a standalone object.
points = nba["pts"]

In [None]:
# Inspect the Python type of the selected column.
type(points)

In [None]:
# Sum of pts over every row of the dataset.
points.sum()

In [None]:
# Total pts per fran_id; sort=False keeps groups in order of first appearance.
nba.groupby("fran_id", sort=False)["pts"].sum()

In [None]:
# Count of Spurs rows per (year_id, game_result) for year_id > 2010.
(
    nba.loc[nba["fran_id"].eq("Spurs") & nba["year_id"].gt(2010)]
    .groupby(["year_id", "game_result"])["game_id"]
    .count()
)

In [None]:
# dtype of every column.
nba.dtypes

In [None]:
# Count of Warriors rows in year_id 2015 per (is_playoffs, game_result).
(
    nba.loc[nba["fran_id"].eq("Warriors") & nba["year_id"].eq(2015)]
    .groupby(["is_playoffs", "game_result"])["game_id"]
    .count()
)

In [None]:
# Work on an independent copy so later edits to df leave nba untouched.
df = nba.copy()

In [None]:
# (rows, columns) of the fresh copy.
df.shape

In [None]:
# (rows, columns) of the original frame, for comparison with the copy.
nba.shape

In [None]:
# Point margin per row: own score minus the opponent's score.
df["difference"] = df["pts"].sub(df["opp_pts"])

In [None]:
# (rows, columns) of df after adding the difference column.
df.shape

In [None]:
# Largest value of pts - opp_pts.
df["difference"].max()

In [None]:
# Smallest value of pts - opp_pts.
df["difference"].min()

In [None]:
# New frame with two columns relabelled; df itself keeps its original labels.
renamed_df = df.rename(columns=dict(game_result="result", game_location="location"))

In [None]:
# Column overview of the renamed frame.
renamed_df.info()

In [None]:
# Column overview of df, for comparison with renamed_df.
df.info()

In [None]:
# Labels of the four Elo rating columns.
elo_columns = ["elo_i", "elo_n", "opp_elo_i", "opp_elo_n"]

In [None]:
# Remove the Elo rating columns from df.
# Reassigning (instead of inplace=True) and ignoring labels that are already
# gone lets this cell be re-run without raising KeyError.
df = df.drop(columns=elo_columns, errors="ignore")

In [None]:
# (rows, columns) of df after dropping the Elo columns.
df.shape

In [None]:
# Show date_game before it is converted to datetime.
df["date_game"]

In [None]:
# Convert the month/day/year text dates (e.g. "11/1/1946") to timestamps.
# An explicit format avoids depending on pandas' format inference.
df["date_game"] = pd.to_datetime(df["date_game"], format="%m/%d/%Y")

In [None]:
# Show date_game again after the datetime conversion.
df["date_game"]

In [None]:
# Number of distinct game_location values.
df["game_location"].nunique()

In [None]:
# Row count per game_location value.
df["game_location"].value_counts()

In [None]:
# Store game_location as a categorical column.
df["game_location"] = df["game_location"].astype("category")

In [None]:
# Confirm the column's dtype after the conversion.
df["game_location"].dtype

In [None]:
# Column overview after converting game_location.
df.info()

In [None]:
# Row count per game_result value.
df["game_result"].value_counts()

In [None]:
# Store game_result as a categorical column.
df["game_result"] = df["game_result"].astype("category")

In [None]:
# Column overview after converting game_result.
df.info()

In [None]:
# (rows, columns) of the modified copy.
df.shape

In [None]:
# (rows, columns) of the original frame; df was created with nba.copy().
nba.shape

In [None]:
# Drop every row that contains at least one missing value (dropna defaults).
rows_without_missing_data = nba.dropna()

In [None]:
# (rows, columns) left after dropping incomplete rows.
rows_without_missing_data.shape

In [None]:
# Drop every column that contains at least one missing value.
data_without_missing_columns = nba.dropna(axis="columns")

In [None]:
# (rows, columns) left after dropping incomplete columns.
data_without_missing_columns.shape

In [None]:
# Separate copy of nba used to fill in a default value for notes.
data_with_default_notes = nba.copy()

In [None]:
# Replace missing notes with a placeholder string.
# Assign the result back to the column: fillna(inplace=True) on
# data_with_default_notes["notes"] is chained assignment and, under pandas
# Copy-on-Write, would leave the DataFrame unchanged.
data_with_default_notes["notes"] = data_with_default_notes["notes"].fillna("no notes at all")

In [None]:
# Summary of the notes column after filling in the default.
data_with_default_notes["notes"].describe()

In [None]:
# Sanity check: list any rows where pts is exactly 0.
nba.loc[nba["pts"].eq(0)]

In [None]:
# Consistency check: True when no row has pts > opp_pts with game_result other than "W".
nba[nba["pts"].gt(nba["opp_pts"]) & nba["game_result"].ne("W")].empty

In [None]:
# Consistency check: True when no row has pts < opp_pts with game_result other than "L".
nba[nba["pts"].lt(nba["opp_pts"]) & nba["game_result"].ne("L")].empty

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Line plot of the Knicks' total pts per year_id.
(
    nba.loc[nba["fran_id"].eq("Knicks")]
    .groupby("year_id")["pts"]
    .sum()
    .plot()
)

In [None]:
# Bar chart of the ten fran_id values with the most rows.
nba["fran_id"].value_counts().head(10).plot.bar()

In [None]:
# Pie chart of game_result counts for the Heat in year_id 2013.
(
    nba.loc[nba["fran_id"].eq("Heat") & nba["year_id"].eq(2013), "game_result"]
    .value_counts()
    .plot.pie()
)