In [1]:
import pandas as pd
print(pd.__version__)

1.5.3


#### Series

In [2]:
import numpy as np

# A Series built from a plain Python list gets a default 0..n-1 integer index
data_list = list(range(10, 60, 10))
series_from_list = pd.Series(data_list)
print(series_from_list)

# A Series built from a NumPy array takes its dtype from the array
data_array = np.array([1.1, 2.2, 3.3, 4.4, 5.5])
series_from_array = pd.Series(data_array)
print(series_from_array)

# A Series built from a dictionary uses the dict keys as index labels
data_dict = dict(A=100, B=200, C=300, D=400)
series_from_dict = pd.Series(data_dict)
print(series_from_dict)

0    10
1    20
2    30
3    40
4    50
dtype: int64
0    1.1
1    2.2
2    3.3
3    4.4
4    5.5
dtype: float64
A    100
B    200
C    300
D    400
dtype: int64


In [4]:
# Accessing Series elements with plain [] indexing (the key is an index label)
print(series_from_list[0])
print(series_from_array[2])
print(series_from_dict['B'])

# Explicit indexers: .iloc selects by integer position, .loc selects by index label
print(series_from_list.iloc[1])   # position 1 -> the second element
print(series_from_array.loc[2])   # the element whose index label is 2

10
3.3
200
20
3.3


In [None]:
# Element-wise addition of two Series; values are matched up by index label
result = series_from_list.add(series_from_array)
print(result)

# Keep only the entries whose value is greater than 20 (boolean-mask selection)
above_twenty = series_from_list > 20
filtered_series = series_from_list[above_twenty]
print(filtered_series)

# Boolean Series flagging which entries are missing
null_flags = series_from_dict.isnull()
print(null_flags)

#### DataFrame

In [6]:
# Column-oriented input: each key is a column name, each list holds that column's values
data_dict = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, 30, 35, 40],
    City=['New York', 'London', 'Paris', 'Tokyo'],
)
df = pd.DataFrame(data_dict)
print(df)
print("----------------")

# Row-oriented input: one dictionary per row, keyed by column name
data_list_of_dicts = [
    dict(Name='Alice', Age=25, City='New York'),
    dict(Name='Bob', Age=30, City='London'),
    dict(Name='Charlie', Age=35, City='Paris'),
    dict(Name='David', Age=40, City='Tokyo'),
]
df_from_list_of_dicts = pd.DataFrame(data_list_of_dicts)
print(df_from_list_of_dicts)

# A DataFrame can also be loaded from external data (e.g., a CSV file)
# df = pd.read_csv('data.csv')

      Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   35     Paris
3    David   40     Tokyo
----------------
      Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   35     Paris
3    David   40     Tokyo


In [9]:
# Displaying basic information about the DataFrame.
# info() writes its report straight to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

# Displaying summary statistics
print(df.describe())

# Accessing specific columns (bracket access and attribute access)
print(df['Name'])
print(df.Age)
print("--------------")
# Accessing rows using iloc (integer-location based indexing)
print(df.iloc[0])  # First row
print(df.iloc[1:3])  # Positions 1 and 2 (second and third rows); the end position is excluded
print("--------------")
# Accessing rows using loc (label-location based indexing)
print("First row:")
print(df.loc[0])  # Row whose index label is 0
print("Rows 2 and 3:")
print(df.loc[1:2])  # Labels 1 through 2; label slices include both endpoints


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   City    4 non-null      object
dtypes: int64(1), object(2)
memory usage: 224.0+ bytes
None
             Age
count   4.000000
mean   32.500000
std     6.454972
min    25.000000
25%    28.750000
50%    32.500000
75%    36.250000
max    40.000000
0      Alice
1        Bob
2    Charlie
3      David
Name: Name, dtype: object
0    25
1    30
2    35
3    40
Name: Age, dtype: int64
--------------
Name       Alice
Age           25
City    New York
Name: 0, dtype: object
      Name  Age    City
1      Bob   30  London
2  Charlie   35   Paris
--------------
First row:
Name       Alice
Age           25
City    New York
Name: 0, dtype: object
Rows 2 and 3:
      Name  Age    City
1      Bob   30  London
2  Charlie   35   Paris


In [12]:
# Reading data from a CSV file
#df = pd.read_csv('data.csv')

# Reading data from an Excel file
#df = pd.read_excel('data.xlsx')

# Reading data from a SQL database
# import sqlite3
# conn = sqlite3.connect('database.db')
# df = pd.read_sql_query('SELECT * FROM table_name', conn)

# Writing data to a CSV file
#df.to_csv('output.csv', index=False)

# Writing data to an Excel file
#df.to_excel('output.xlsx', index=False)

# Writing data to a SQL database
# df.to_sql('table_name', conn, if_exists='replace', index=False)

In [None]:
# One column label -> a Series
name_column = df.loc[:, 'Name']
print(name_column)

# A list of column labels -> a DataFrame holding just those columns
selected_labels = ['Name', 'Age']
name_age_columns = df[selected_labels]
print(name_age_columns)

In [None]:
# Keep the rows where a boolean mask is True
older_than_30 = df['Age'] > 30
filtered_df = df[older_than_30]
print(filtered_df)

# Combine masks element-wise: & means AND, | means OR
older_than_25 = df['Age'] > 25
lives_in_new_york = df['City'] == 'New York'
filtered_df = df[older_than_25 & lives_in_new_york]
print(filtered_df)

In [None]:
# Assigning to a new column label appends that column to the frame
salaries = [50000, 60000, 70000, 80000]
df['Salary'] = salaries
print(df)

# drop() returns a new frame without the named column; rebind df to it
df = df.drop(columns='Salary')
print(df)


In [None]:
def add_five_years(age):
    """Return the given age increased by five."""
    return age + 5


# Run the function on every value of the column and store the result back
df['Age'] = df['Age'].apply(add_five_years)
print(df)

In [None]:
# Remove rows that repeat a 'Name', retaining the first occurrence of each
df = df.drop_duplicates(subset=['Name'], keep='first')

In [None]:
# Pandas provides various methods to handle missing data in DataFrames,
# such as dropna, fillna, and interpolate.

# Creating a DataFrame with missing data
data = {'A': [1, 2, np.nan, 4, 5],
        'B': [6, np.nan, 8, 9, 10],
        'C': [11, 12, 13, np.nan, 15]}
df = pd.DataFrame(data)
print(df)

# Each example below returns a NEW frame and leaves `df` untouched. Dropping
# the NaN rows from `df` itself would leave nothing for the fillna and
# interpolate examples to act on.

# Drop rows with any missing values
df_dropped = df.dropna()
print(df_dropped)

# Fill missing values with a specific value
df_filled = df.fillna(0)
print(df_filled)

# Fill missing values in column 'A' using interpolation (linear by default)
df_interpolated = df.assign(A=df['A'].interpolate())
print(df_interpolated)

# Sorting Data:
# Sorting by a single column
df_sorted_by_b = df_filled.sort_values(by='B')
print(df_sorted_by_b)

# Sorting by multiple columns: 'B' descending, ties broken by 'C' ascending
df_sorted_by_b_c = df_filled.sort_values(by=['B', 'C'], ascending=[False, True])
print(df_sorted_by_b_c)

In [None]:
# Grouping by a single column and calculating the mean of each group
grouped_df = df.groupby('City')['Age'].mean()
print(grouped_df)

# Grouping by multiple columns and calculating the sum of each group
grouped_df = df.groupby(['City', 'Gender'])['Salary'].sum()
print(grouped_df)

In [None]:
# Two small frames sharing an 'ID' column; IDs 4 and 5 each appear in only one of them
data1 = dict(ID=[1, 2, 3, 4], Name=['Alice', 'Bob', 'Charlie', 'David'])
data2 = dict(ID=[1, 2, 3, 5], Age=[25, 30, 35, 40])
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# Inner merge on 'ID': only IDs present in both frames are kept
merged_df = df1.merge(df2, on='ID', how='inner')
print(merged_df)

# Outer join: df1's 'ID' column is matched against df2's index (set to 'ID' first),
# keeping IDs from either side
df2_by_id = df2.set_index('ID')
joined_df = df1.join(df2_by_id, on='ID', how='outer')
print(joined_df)

In [None]:
# Ten consecutive daily timestamps starting 2023-01-01, paired with temperature readings
date_rng = pd.date_range(start='2023-01-01', periods=10, freq='D')
data = dict(
    Date=date_rng,
    Temperature=[15, 20, 22, 18, 25, 28, 30, 29, 27, 23],
)
df = pd.DataFrame(data)
print(df)

# Use the 'Date' column as the index so that time-based operations can be applied
df = df.set_index('Date')

# Downsample to weekly bins and average the readings inside each bin
weekly_mean = df.resample('W').mean()
print(weekly_mean)

# Mean over a sliding window of 3 consecutive readings
temperature = df['Temperature']
rolling_mean = temperature.rolling(window=3).mean()
print(rolling_mean)

https://medium.com/swlh/python-pandas-dataframe-tutorial-for-beginners-c59e8f206350

https://blog.devgenius.io/transforming-data-with-pandas-a-step-by-step-tutorial-e1a2e26d2c74