In [None]:
import pandas as pd 
import requests
import numpy as np

In [None]:
# Build a Series from a plain list; with no index given, pandas labels the
# entries with the default integer index 0, 1, 2
revenues = pd.Series(data=[5555, 7000, 1980])
revenues  # Last expression of the cell, so the Series is displayed

In [None]:
# Accessing the values of the Series
revenues.values

In [None]:
# Accessing the index of the Series
revenues.index

In [None]:
# Build a Series whose labels are city names instead of the default integers
city_revenues = pd.Series(
    data=[4200, 8000, 6500],
    index=['Amsterdam', 'Toronto', 'Tokyo'],
)
city_revenues  # Displayed with the city names as the index

In [None]:
# Accessing a single value by index label
city_revenues['Toronto']

In [None]:
# A dictionary converts directly to a Series: keys become the index labels,
# values become the data
city_employee_count_data = dict(Amsterdam=5, Tokyo=8)
city_employee_count = pd.Series(data=city_employee_count_data)
city_employee_count  # Displaying the Series

In [None]:
# Accessing the index keys of the Series
city_employee_count.keys()

In [None]:
# Checking if an index label is in the Series
'Tokyo' in city_employee_count

In [None]:
# Combine the two Series into a DataFrame: each dictionary key becomes a column
# name and the rows are aligned on the Series' index labels
city_data = pd.DataFrame(
    data=dict(revenue=city_revenues, employee_count=city_employee_count)
)
city_data  # Displaying the DataFrame

In [None]:
# Accessing the axes (index labels and column names) of the DataFrame
city_data.axes

In [None]:
# Accessing a value by integer position. city_revenues is labelled with city
# names, so plain city_revenues[1] only works through a deprecated positional
# fallback; .iloc states the positional intent explicitly.
city_revenues.iloc[1]

In [None]:
# Accessing a value by index label
city_revenues['Toronto']

In [None]:
# Accessing the last value by negative position. Use .iloc for this: with plain
# [] an integer key on this string-labelled Series relies on a deprecated
# positional fallback, and on an integer-labelled Series -1 would be looked up
# as a label instead of a position.
city_revenues.iloc[-1]

In [None]:
# Slicing the Series from a specified label to the end
city_revenues['Toronto':]

In [None]:
# Creating a Series with a non-default index
colors = pd.Series(['red', 'purple', 'blue', 'green', 'yellow'], index=[1, 2, 3, 5, 8])
colors  # Displaying the Series

In [None]:
# Accessing a value by the non-default integer index
colors[1]

In [None]:
# Accessing a value using the .loc indexer
colors.loc[1]

In [None]:
# Accessing a value using the .iloc indexer for position-based indexing
colors.iloc[1]

In [None]:
# Slicing the Series using .iloc for positions
colors.iloc[1:3]

In [None]:
# Slicing the Series using .loc for labels
colors.loc[3:8]

In [None]:
# Accessing the second to last value using .iloc (negative indexing)
colors.iloc[-2]

In [None]:
# Accessing a DataFrame column as a Series
city_data.revenue

In [None]:
# Accessing a row by label
city_data.loc['Amsterdam']

In [None]:
# Accessing a row by integer location
city_data.iloc[1]

In [None]:
# Slicing rows by label
city_data.loc['Tokyo': 'Toronto']

In [None]:
# Accessing a specific column for a range of rows by labels
city_data.loc['Amsterdam': 'Tokyo', 'revenue']

In [None]:
# Displaying the city_revenues Series
city_revenues

In [None]:
# Calculating the sum of the city_revenues Series
city_revenues.sum()

In [None]:
# Finding the maximum value in the city_revenues Series
city_revenues.max()

In [None]:
# Finding the minimum value in the city_revenues Series
city_revenues.min()

In [None]:
# Calculating the mean of the city_revenues Series
city_revenues.mean()

In [None]:
# Displaying the city_data DataFrame
city_data

In [None]:
# Two additional cities, described with the same columns as city_data
further_city_data = pd.DataFrame(
    data={'revenue': [7000, 3400], 'employee_count': [2, 2]},
    index=['New York', 'Barcelona'],
)
# Stack the new rows underneath the existing ones (row-wise concat is the default)
all_city_data = pd.concat(objs=[city_data, further_city_data], sort=False)
all_city_data  # Displaying the concatenated DataFrame

In [None]:
# Per-city lookup table: the country each city belongs to and a 0/1 'capital' column
city_countries = pd.DataFrame(
    index=['Amsterdam', 'Tokyo', 'Rotterdam', 'Toronto', 'Barcelona'],
    data={
        'country': ['Holland', 'Japan', 'Holland', 'Canada', 'Spain'],
        'capital': [1, 1, 0, 0, 0],
    },
)

In [None]:
# Place the country columns next to the city columns; join='inner' keeps only
# the index labels present in both DataFrames
cities = pd.concat(
    objs=[all_city_data, city_countries],
    axis='columns',
    join='inner',
    sort=False,
)
cities  # Displaying the concatenated DataFrame

In [None]:
# Country-level table indexed by country name
countries = pd.DataFrame(
    index=['Holland', 'Japan', 'Canada'],
    data={
        'population_millions': [17, 127, 37],
        'continent': ['Europe', 'Asia', 'North America'],
    },
)

In [None]:
# Merging city data with country data, matching cities['country'] against the
# index of `countries`. No `how` is given, so pd.merge uses its default inner
# join: cities whose country has no row in `countries` are dropped.
pd.merge(cities, countries, left_on='country', right_index=True)

In [None]:
# Same keys as the merge above, but with how='left' every row of `cities` is kept;
# the columns coming from `countries` are NaN where no matching country exists
pd.merge(cities, countries, left_on='country', right_index=True, how='left')

In [None]:
# Download the NBA Elo CSV from the fivethirtyeight GitHub repository, save it
# next to the notebook, and load it into a DataFrame
download_url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/nba-elo/nbaallelo.csv'

# The timeout keeps the cell from hanging indefinitely on a stalled connection
response = requests.get(download_url, timeout=60)
# Raise on an HTTP error instead of silently saving an error page as "CSV"
response.raise_for_status()

with open('nba_all_elo.csv', 'wb') as f:
    f.write(response.content)

nba = pd.read_csv('nba_all_elo.csv')
nba.head()  # Displays the first 5 rows of the nba DataFrame

In [None]:
nba.tail()  # Displays the last 5 rows of the nba DataFrame

In [None]:
nba.tail(10)  # Displays the last 10 rows of the nba DataFrame

In [None]:
nba.info()  # Provides a summary of the nba DataFrame, including the number of non-null entries for each column

In [None]:
nba.describe()  # Generates descriptive statistics that summarize the central tendency, dispersion, and shape of the dataset’s distribution, excluding NaN values

In [None]:
nba['team_id'].value_counts()  # Returns counts of unique values for 'team_id', showing the distribution of teams

In [None]:
nba['fran_id'].value_counts()  # Returns counts of unique values for 'fran_id', showing the distribution of franchise ids

In [None]:
nba['fran_id'] == 'Lakers'  # Creates a boolean Series indicating rows where 'fran_id' is 'Lakers'

In [None]:
nba.loc[nba['fran_id'] == 'Lakers']  # Selects rows where 'fran_id' is 'Lakers'

In [None]:
nba.loc[nba['fran_id'] == 'Lakers', 'team_id']  # Selects 'team_id' for rows where 'fran_id' is 'Lakers'

In [None]:
nba.loc[nba['fran_id'] == 'Lakers', 'team_id'].value_counts()  # Counts of 'team_id' for Lakers, showing distribution of team IDs for Lakers

In [None]:
nba.loc[nba['team_id'] == 'MNL', 'date_game']  # Selects 'date_game' for rows where 'team_id' is 'MNL'

In [None]:
# Maximum of 'date_game' for team 'MNL'. At this point the column has not been
# passed through pd.to_datetime, so (assuming read_csv loaded it as text) this is
# a string comparison, not a chronological one. The 'date_played' cells that
# follow repeat the query on real datetimes.
nba.loc[nba['team_id'] == 'MNL', 'date_game'].max()

In [None]:
nba['date_played'] = pd.to_datetime(nba['date_game'])  # Converts 'date_game' column to datetime and stores in 'date_played'

In [None]:
nba.loc[nba['team_id'] == 'MNL', 'date_played'].max()  # Finds the latest 'date_played' for 'MNL' team

In [None]:
total_bos_points = nba.loc[nba['team_id'] == 'BOS', 'pts'].sum()  # Sums up all points scored by 'BOS' team

In [None]:
# Number of distinct calendar years in which 'BOS' has a game. The vectorised
# .dt accessor replaces a per-row Python lambda; nunique() counts distinct
# values directly (missing dates, if any, are not counted).
years_bos_played = nba.loc[nba['team_id'] == 'BOS', 'date_played'].dt.year.nunique()

In [None]:
total_bos_points / years_bos_played  # Calculates average points per year for 'BOS' team

In [None]:
'points' in nba.axes[1]  # Checks if 'points' is a column in the DataFrame

In [None]:
'pts' in nba.axes[1]  # Checks if 'pts' is a column in the DataFrame

In [None]:
'pts' in nba.keys()  # Another way to check if 'pts' is a column in the DataFrame

In [None]:
nba.iloc[-2]  # Selects the second to last row of the DataFrame

In [None]:
nba.loc[5555:5559, ['fran_id', 'opp_fran', 'pts', 'opp_pts']]  # Selects rows 5555 to 5559 and specific columns

In [None]:
current_decade = nba[nba['year_id'] > 2010]  # Filters rows where 'year_id' is greater than 2010

In [None]:
current_decade.shape  # Shows the shape (number of rows and columns) of the filtered DataFrame

In [None]:
nba['year_id'] > 2010  # Creates a boolean Series for years greater than 2010

In [None]:
nba[nba['year_id'] > 2010]  # Filters rows for games played after 2010

In [None]:
nba['notes'].notnull()  # Creates a boolean Series for rows where 'notes' column is not null

In [None]:
games_with_notes = nba[nba['notes'].notnull()]  # Filters rows where 'notes' column is not null
games_with_notes  # Displays the filtered DataFrame

In [None]:
ers_teams = nba[nba['fran_id'].str.endswith('ers')]  # Filters teams whose franchise id ends with 'ers'
ers_teams  # Displays the filtered DataFrame

In [None]:
nba[(nba['team_id'] == 'BLB') & (nba['pts'] > 100) & (nba['opp_pts'] > 100)]  # Filters games where 'BLB' scored and conceded more than 100 points

In [None]:
nba['pts'].sum()  # Sums up all points scored in the dataset

In [None]:
nba['fran_id'].unique()  # Returns unique franchise ids in the dataset

In [None]:
group_fran = nba.groupby('fran_id', sort=False)  # Groups data by 'fran_id', without sorting
group_fran  # Displays the GroupBy object

In [None]:
group_fran['pts'].sum()  # Sums up points for each franchise

In [None]:
# Bulls games grouped by the (year_id, game_result) pair
year_results = nba.loc[nba['fran_id'].eq('Bulls')].groupby(by=['year_id', 'game_result'])

In [None]:
year_results['game_id'].count()  # Counts games in each group

In [None]:
# Warriors games with year_id 2015, counted per (is_playoffs, game_result) combination
nba.loc[
    nba['fran_id'].eq('Warriors') & nba['year_id'].eq(2015)
].groupby(by=['is_playoffs', 'game_result'])['game_id'].count()

In [None]:
nba_copy = nba.copy()  # Creates a copy of the nba DataFrame

In [None]:
nba_copy['difference'] = nba_copy['pts'] - nba_copy['opp_pts']  # Calculates point difference for each game and stores in 'difference'

In [None]:
nba_copy.head()  # Displays the first 5 rows of the copied DataFrame

In [None]:
nba_copy['difference'].max()  # Finds the maximum point difference

In [None]:
# Renames 'game_result' -> 'result' and 'game_location' -> 'location'.
# The renamed frame is bound back to nba_copy instead of using inplace=True,
# which keeps the operation chainable and the data lineage explicit.
nba_copy = nba_copy.rename(
    columns={
        'game_result': 'result',
        'game_location': 'location'
    }
)

In [None]:
elo_columns = ['elo_i', 'elo_n', 'opp_elo_i', 'opp_elo_n']  # Defines a list of columns related to ELO ratings

In [None]:
nba_copy.drop(elo_columns, axis=1)  # Returns a new DataFrame with ELO columns dropped

In [None]:
nba_copy.info()  # Provides a summary of the nba_copy DataFrame, including the number of non-null entries for each column

In [None]:
# Drops the ELO columns from nba_copy; the result is rebound to the name
# instead of mutating the frame with inplace=True
nba_copy = nba_copy.drop(columns=elo_columns)

In [None]:
nba_copy.info()  # Provides an updated summary of the nba_copy DataFrame

In [None]:
nba['date_game']  # Displays the 'date_game' column

In [None]:
nba['date_game'] = pd.to_datetime(nba['date_game'])  # Converts 'date_game' column to datetime

In [None]:
nba.info()  # Provides a summary of the nba DataFrame after the conversion

In [None]:
nba['game_location'].unique()  # Displays unique values in the 'game_location' column

In [None]:
nba['game_location'] = pd.Categorical(nba['game_location'])  # Converts 'game_location' column to categorical type

In [None]:
nba.info()  # Provides a summary of the nba DataFrame after converting 'game_location' to categorical

In [None]:
nba['game_location'].dtype  # Displays the data type of 'game_location' column

In [None]:
nba['game_result'].unique()  # Displays unique values in the 'game_result' column

In [None]:
nba['game_result'] = pd.Categorical(nba['game_result'])  # Converts 'game_result' column to categorical type
nba.info()  # Provides a summary of the nba DataFrame after converting 'game_result' to categorical

In [None]:
clean_nba = nba.dropna()  # Creates a new DataFrame by dropping rows with any missing values

In [None]:
clean_nba.shape  # Displays the shape of the clean_nba DataFrame

In [None]:
clean_nba = nba.dropna(axis=1)  # Creates a new DataFrame by dropping columns with any missing values

In [None]:
clean_nba.shape  # Displays the shape of the clean_nba DataFrame after dropping columns

In [None]:
clean_nba.info()  # Provides a summary of the clean_nba DataFrame

In [None]:
nba_copy = nba.copy()  # Creates a copy of the nba DataFrame

In [None]:
# Replaces missing values in 'notes' with 'no notes at all'.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# nba_copy['notes'] selection is chained assignment, which emits a warning in
# pandas 2.x and leaves nba_copy unchanged once Copy-on-Write is active.
nba_copy['notes'] = nba_copy['notes'].fillna('no notes at all')
nba_copy.info()  # Provides a summary of the nba_copy DataFrame after filling missing values

In [None]:
nba_copy['notes'].describe()  # Generates descriptive statistics for the 'notes' column

In [None]:
nba.describe()  # Generates descriptive statistics for numerical columns in the original nba DataFrame

In [None]:
nba[nba['pts'] == 0]  # Filters games where a team scored 0 points

In [None]:
nba[(nba['pts'] < nba['opp_pts']) & (nba['game_result'] != 'L')].empty  # Checks if there are any anomalies where a team scored fewer points but didn't lose

In [None]:
nba[(nba['pts'] > nba['opp_pts']) & (nba['game_result'] != 'W')].empty  # Checks if there are any anomalies where a team scored more points but didn't win

In [None]:
# Total points scored by the Knicks franchise, one value per year_id
knicks_pts_by_year = (
    nba.loc[nba['fran_id'].eq('Knicks')]
    .groupby(by='year_id')['pts']
    .sum()
)

In [None]:
knicks_pts_by_year

In [None]:
!pip install matplotlib

In [None]:
# Line chart of the Knicks' summed points per year_id; the title and y-label let
# the figure stand on its own, and the trailing ';' hides the text repr
knicks_pts_by_year.plot(title='Knicks: total points per year').set_ylabel('pts');

In [None]:
# Bar chart of the last 10 entries of knicks_pts_by_year; titled and labelled so
# the figure stands on its own, trailing ';' hides the text repr
knicks_pts_by_year.tail(10).plot(kind='bar', title='Knicks: total points, last 10 years in the data').set_ylabel('pts');

In [None]:
# Scatter plot of pts against opp_pts for Knicks rows with year_id > 2000;
# s=0.5 keeps the markers small, the trailing ';' hides the Axes repr
nba[(nba['fran_id'] == 'Knicks') & (nba['year_id'] > 2000)].plot(
    kind='scatter',
    x='pts',
    y='opp_pts',
    s=0.5,
    title='Knicks games with year_id > 2000: pts vs opp_pts',
);

In [None]:
# True when no row has pts equal to opp_pts, i.e. the filter for tied scores matches nothing
nba[nba['pts'] == nba['opp_pts']].empty

In [None]:
# Count of each game_result value for Heat rows with year_id 2013
heat_13_wl = (
    nba.loc[nba['fran_id'].eq('Heat') & nba['year_id'].eq(2013), 'game_result']
    .value_counts()
)
heat_13_wl  # Displaying the counts

In [None]:
# Pie chart of the Heat 2013 game_result counts; titled so the figure stands on
# its own, trailing ';' hides the Axes repr
heat_13_wl.plot(kind='pie', title='Heat 2013: game results');