This guide covers essential pandas syntax across data inspection, basic manipulation, aggregation, joins/unions, datetime operations, and null handling.
Getting Started
To get started with this project, follow these steps:
1. Clone the repository:
   git clone https://github.com/MarkPhamm/pandas-tutorial.git
   cd pandas-tutorial
2. Create a virtual environment:
python -m venv venv
3. Activate the virtual environment:
- On Windows:
venv\Scripts\activate
- On macOS and Linux:
source venv/bin/activate
4. Install the required packages:
pip install -r requirements.txt
df.head()
df.tail()
df.columns
df['col']
# or
df.col
df.shape
df.info()
df.describe()
df.duplicated().sum()
df['col'].value_counts()
if df.empty:
    df = pd.DataFrame({'SecondHighestSalary': [None]})
df[['col1', 'col2']]
df[df['col'] > 100]
df[df['col'].between(10, 50)]
df.rename(columns={'old_name': 'new_name'})
df.sort_values(by='col', ascending=False)
df.loc[5]
df.iloc[5]
df = df.astype({'col_name': 'desired_dtype'})
# or
df['col_name'] = df['col_name'].astype('desired_dtype')
df[~(df['col'] == 'target')]  # parentheses required: ~ binds tighter than ==
df.groupby('group_col')['value_col'].sum().reset_index()
df.groupby('group_col')['value_col'].count().reset_index()
df.groupby('group_col')['value_col'].nunique().reset_index()
df.groupby('group_col').agg({
    'col1': 'sum',
    'col2': 'mean',
    'col3': 'nunique'
}).reset_index()
df1.merge(df2, left_on='key1', right_on='key2', how='inner')
df1.merge(df2, on='key', how='left')
pd.concat([df1, df2], ignore_index=True)
pd.concat([df1, df2], ignore_index=True).drop_duplicates()
pd.Timestamp('2025-04-12')
pd.Timedelta(days=30)
df['date_col'] = pd.to_datetime(df['date_col'])
df['day'] = df['date_col'].dt.day
df['month'] = df['date_col'].dt.to_period('M')
(df['end_time'] - df['start_time']).dt.total_seconds()
df['col'].isnull()
df[df['col'].isnull()]
df.drop_duplicates()
df.drop_duplicates(subset=['col1', 'col2'])
df.drop_duplicates(keep='last')
df[df.duplicated() == False]
df[df.duplicated()]
df.duplicated()
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', index_col=0)
df.to_csv('output.csv', index=False)
df = pd.read_csv('data.csv', usecols=['col1', 'col2'])
df = pd.read_csv('data.csv', nrows=100)
df = pd.read_csv('data.csv', skiprows=1)
df = pd.read_csv('data.csv', na_values=['NA', 'null', 'NaN'])