# Introduction to Pandas
Pandas is a powerful library for data manipulation and analysis. It is built on top of NumPy and provides the data structures and functions needed to work with structured data.

In [1]:
# Importing Pandas
import pandas as pd
print(pd.__version__)

# Data Structures: Series and DataFrame
Pandas has two primary data structures:
- **Series**: A one-dimensional labeled array capable of holding any data type.
- **DataFrame**: A two-dimensional labeled data structure with columns of potentially different types.

In [2]:
# Build a one-dimensional Series from the integers 1 through 5.
# No index is given, so pandas assigns the default 0..4 integer labels.
s = pd.Series(list(range(1, 6)))
print(s)
# Show the three building blocks of a Series: labels, raw values, element type.
print(s.index, s.values, s.dtype)

In [3]:
# Build a DataFrame from a dict that maps column names to equal-length lists.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
}
df = pd.DataFrame(data=data)
print(df)
# Inspect the row labels, column labels, underlying array, shape and per-column dtypes.
print(df.index, df.columns, df.values, df.shape, df.dtypes)

# Data Indexing and Selection
Pandas provides powerful indexing and selection capabilities. You can select data by label or position, and use boolean indexing to filter data.

In [4]:
# Label-based (.loc) and position-based (.iloc) access, printed in pairs.
for selection in (
    df.loc[0],          # row with index label 0
    df.iloc[0],         # first row by position
    df.loc[:, 'Name'],  # all rows, the 'Name' column by label
    df.iloc[:, 0],      # all rows, the first column by position
    df.loc[0, 'Name'],  # single value by row label and column label
    df.iloc[0, 0],      # single value by row and column position
):
    print(selection)

In [5]:
# Boolean indexing: build a True/False mask, then keep only the rows where it is True.
is_over_30 = df['Age'] > 30
print(df[is_over_30])

In [6]:
# Promote the 'Name' column to the row index (modifies df in place).
df.set_index(keys='Name', inplace=True)
print(df)
# Move the index back into a regular column and restore the default integer index.
df.reset_index(drop=False, inplace=True)
print(df)

# Data Manipulation
Pandas makes it easy to manipulate data. You can add or modify columns, and drop unnecessary columns or rows.

In [7]:
# Add a new column by assigning a list with one value per row.
salaries = [50000, 60000, 70000]
df['Salary'] = salaries
print(df)
# Modify an existing column: every age goes up by one year.
df['Age'] = df['Age'] + 1
print(df)

In [8]:
# Remove the 'Salary' column in place.
df.drop(columns='Salary', inplace=True)
print(df)
# Remove the row whose index label is 0 in place.
df.drop(index=0, inplace=True)
print(df)

In [9]:
# Renaming columns: map each old column name to its new name.
# (Only a column is renamed here; the index labels are left untouched.)
column_renames = {'Age': 'Years'}
df.rename(columns=column_renames, inplace=True)
print(df)

# Handling Missing Data
Handling missing data is crucial in data analysis. Pandas provides functions to detect, fill, and drop missing data.

In [10]:
# Detecting Missing Data
# An integer column cannot represent a missing value. Cast 'Years' to float
# explicitly before inserting one, instead of relying on pandas silently
# upcasting the column during assignment (deprecated behaviour, see PDEP-6).
df['Years'] = df['Years'].astype('float64')
# In a float column, None is stored as NaN.
df.loc[2, 'Years'] = None
# isnull() marks missing cells with True; notnull() is its element-wise inverse.
print(df.isnull())
print(df.notnull())

In [11]:
# Replace every missing value in the frame with 0 (modifies df in place).
df.fillna(value=0, inplace=True)
print(df)

In [12]:
# Drop every row that still contains at least one missing value (in place).
# Rows without missing values are kept unchanged.
df.dropna(axis=0, how='any', inplace=True)
print(df)

# Data Aggregation and Grouping
Pandas provides powerful aggregation functions to summarize data. Grouping and pivot tables are key features for data analysis.

In [13]:
# Grouping Data
# Split the rows into groups that share the same 'Years' value.
grouped = df.groupby('Years')
# numeric_only=True restricts the mean to numeric columns. Without it,
# pandas >= 2.0 raises TypeError when it tries to average a string column
# such as 'Name'.
print(grouped.mean(numeric_only=True))

In [14]:
# Aggregation Functions
# Apply several aggregations at once. Select the numeric 'Years' column first:
# 'mean' is not defined for string columns such as 'Name', so aggregating the
# whole group object raises TypeError.
print(grouped['Years'].agg(['sum', 'mean']))

In [15]:
# Pivot Tables
# Assign the salaries as a Series so they are aligned on the row index labels.
# A plain 3-element list raises ValueError when df no longer has exactly three
# rows (a row was dropped earlier in the notebook); with a Series each
# remaining row picks up the salary stored under its own label.
df['Salary'] = pd.Series([50000, 60000, 70000])
# One row per 'Years' value, one column per 'Name', cells hold the mean salary.
print(df.pivot_table(values='Salary', index='Years', columns='Name', aggfunc='mean'))

# Merging and Joining DataFrames
Combining DataFrames is essential in data analysis. Pandas provides several functions to merge and join DataFrames.

In [16]:
# Stack two frames with identical columns on top of each other.
df1 = pd.DataFrame({'A': ['A0', 'A1'], 'B': ['B0', 'B1']})
df2 = pd.DataFrame({'A': ['A2', 'A3'], 'B': ['B2', 'B3']})
# axis=0 appends rows; each frame keeps its own original index labels.
result = pd.concat(objs=[df1, df2], axis=0)
print(result)

In [17]:
# SQL-style merge on a shared 'key' column.
left = pd.DataFrame({'key': ['K0', 'K1'], 'A': ['A0', 'A1']})
right = pd.DataFrame({'key': ['K0', 'K2'], 'B': ['B0', 'B2']})
# An inner merge keeps only the keys present in both frames.
result = left.merge(right, on='key', how='inner')
print(result)

In [18]:
# Join two frames on their index labels rather than on a column.
left = pd.DataFrame({'A': ['A0', 'A1']}, index=['K0', 'K1'])
right = pd.DataFrame({'B': ['B0', 'B1']}, index=['K0', 'K2'])
# An inner join keeps only the index labels found in both frames.
result = left.join(other=right, how='inner')
print(result)

# Time Series Data
Pandas has robust support for time series data, including date range generation, indexing, and resampling.

In [19]:
# Generate one timestamp per calendar day from 1 to 10 January 2020, both ends included.
date_range = pd.date_range('2020-01-01', '2020-01-10', freq='D')
print(date_range)

In [20]:
# Indexing with Time Series
# Create exactly one daily timestamp per existing row. A hard-coded period
# count raises ValueError whenever it differs from the number of rows in df.
df['Date'] = pd.date_range(start='2020-01-01', periods=len(df), freq='D')
# Use the dates as the row index so df can be sliced and resampled by time.
df.set_index('Date', inplace=True)
print(df)

In [21]:
# Resampling Time Series Data
# Number the rows 1..n, sized to the frame. A fixed-length list raises
# ValueError when it does not match the number of rows.
df['Value'] = list(range(1, len(df) + 1))
# Aggregate into 2-day bins (requires a DatetimeIndex on df).
# numeric_only=True skips string columns, which cannot be averaged.
print(df.resample('2D').mean(numeric_only=True))

# Input and Output Operations
Pandas can read and write data to various file formats, making it easy to handle data from multiple sources.

In [22]:
# Reading Data from Various Formats
# Every call below is left commented out because it needs a real input source.
# Uncomment and replace 'file.csv', 'file.xlsx', etc. with actual file paths or data sources
# NOTE(review): `con` is not defined anywhere in the visible notebook; create a
# database connection and bind it to `con` before enabling the read_sql line.
# df_csv = pd.read_csv('file.csv')
# df_excel = pd.read_excel('file.xlsx')
# df_sql = pd.read_sql('SELECT * FROM table', con)
# df_json = pd.read_json('file.json')
# print(df_csv, df_excel, df_sql, df_json)

In [23]:
# Writing Data to Various Formats
# Every call below is left commented out because it writes to a real destination.
# Uncomment and replace 'file.csv', 'file.xlsx', etc. with actual file paths or destinations
# NOTE(review): `con` is not defined anywhere in the visible notebook; create a
# database connection and bind it to `con` before enabling the to_sql line.
# df.to_csv('file.csv')
# df.to_excel('file.xlsx')
# df.to_sql('table', con)
# df.to_json('file.json')

# Visualization with Pandas
Pandas integrates with Matplotlib to provide basic plotting capabilities, making it easy to visualize data.

In [24]:
# Draw the frame once as a line chart and once as a bar chart,
# using the plot accessor's per-kind methods.
df.plot.line()
df.plot.bar()

In [25]:
# Customise a pandas plot through the Matplotlib Axes object it returns.
import matplotlib.pyplot as plt

axes = df.plot(kind='line')
axes.set_title('Line Plot')
axes.set_xlabel('X-axis')
axes.set_ylabel('Y-axis')
# Render the figure.
plt.show()

# Titanic Dataset Analysis
Now, let's apply what we've learned to the Titanic dataset.

In [26]:
# Download the Titanic passenger list as a DataFrame and preview it.
titanic_csv_url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
titanic = pd.read_csv(titanic_csv_url)
# head(n=5) shows the first five rows.
print(titanic.head(n=5))

### Data Indexing and Selection

In [27]:
# Keep only the 'Name' and 'Age' columns and preview the first rows.
wanted_columns = ['Name', 'Age']
print(titanic[wanted_columns].head())

In [28]:
# Keep only the passengers whose age is greater than 30 and preview the first rows.
older_than_30 = titanic['Age'] > 30
print(titanic[older_than_30].head())

### Data Manipulation

In [29]:
# Derive a new column from an existing one: each passenger's age plus ten years.
titanic['Age_in_10_Years'] = titanic['Age'].add(10)
print(titanic.head())

In [30]:
# Remove the 'Age_in_10_Years' column again (modifies titanic in place).
titanic.drop(columns='Age_in_10_Years', inplace=True)
print(titanic.head())

### Handling Missing Data

In [31]:
# Count the missing values in each column: flag missing cells, then sum the flags per column.
missing_per_column = titanic.isna().sum()
print(missing_per_column)

In [32]:
# Filling missing data
# Replace missing ages with the mean age and assign the result back to the
# column. Calling fillna(inplace=True) on titanic['Age'] is a chained in-place
# update on an intermediate object: pandas deprecates it, and under
# Copy-on-Write it leaves the DataFrame unchanged.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
# Re-count the missing values per column to confirm the result.
print(titanic.isnull().sum())

### Data Aggregation and Grouping

In [33]:
# Mean passenger age per ticket class: group the rows by 'Pclass',
# select the 'Age' column, then average within each group.
age_by_class = titanic.groupby('Pclass')['Age']
grouped = age_by_class.mean()
print(grouped)

### Merging and Joining DataFrames

In [34]:
# Two small frames that share a 'key' column, used to demonstrate merging.
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})
# An inner merge keeps only the keys that appear in both frames.
merged = df1.merge(df2, on='key', how='inner')
print(merged)

### Time Series Data

In [35]:
# Attaching a date index for the time-series examples.
# The previous code parsed the last two words of each passenger name as a
# date; names are not dates, so the parse was coerced to NaT and the index was
# unusable for resampling.
# NOTE(review): this notebook never reads a real date column from the dataset,
# so a synthetic daily sequence (one day per row, starting 1912-04-10) is used
# purely for illustration. Swap in a real date column if one is available.
titanic['Date'] = pd.date_range(start='1912-04-10', periods=len(titanic), freq='D')
titanic.set_index('Date', inplace=True)
print(titanic.head())

In [36]:
# Resampling data
# Aggregate into calendar-month bins (requires a DatetimeIndex on titanic).
# - pd.offsets.MonthEnd() is the month-end frequency as an offset object. It
#   avoids the string alias, which changed from 'M' to 'ME' in newer pandas.
# - numeric_only=True skips text columns such as 'Name', which cannot be
#   averaged and otherwise raise TypeError.
print(titanic.resample(pd.offsets.MonthEnd()).mean(numeric_only=True))

### Input and Output Operations

In [37]:
# Write the frame, including its index, to a CSV file in the working directory.
titanic.to_csv(path_or_buf='titanic_cleaned.csv')
print('DataFrame saved to titanic_cleaned.csv')

### Visualization with Pandas

In [38]:
# Histogram of passenger ages, drawn through the Series plot accessor.
titanic['Age'].plot.hist(title='Age Distribution')
plt.show()

In [39]:
# Box plot of passenger ages, drawn through the Series plot accessor.
titanic['Age'].plot.box(title='Age Boxplot')
plt.show()