# Pandas
Pandas is the most common library in Python to work with dataframes.

In [None]:
import pandas as pd

### Series
A Series is a one-dimensional labeled array capable of holding any data type.

In [None]:
# Build a Series from a label -> value mapping; the keys become the index
s = pd.Series({'a': 10, 'b': 20, 'c': 30, 'd': 40})
print(s)

In [None]:
# No index given, so pandas assigns the default integer positions 0..3
s = pd.Series(data=[10, 20, 30, 40])
print(s)

### DataFrame
A dataframe is a two-dimensional labeled data structure with columns of potentially different data types.

In [None]:
# Column-oriented input: each key becomes a column name,
# each list holds that column's values in row order
data = dict(
    Name=['John', 'Anna', 'Peter', 'Linda'],
    Age=[28, 24, 35, 32],
    City=['New York', 'Paris', 'Berlin', 'London'],
)

df = pd.DataFrame(data=data)
print(df)

Basic operations with dataframes

In [None]:
# Show the first n rows (here n=2)
df.head(n=2)

In [None]:
# Show the last n rows (here n=2)
df.tail(n=2)

In [None]:
# Viewing summary information: index range, column names,
# non-null counts per column, dtypes and memory usage.
# info() prints its report directly; the call itself returns None.
df.info()

Descriptive statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max).
# By default only numeric columns are summarized.
df.describe()

Selection and indexing

In [None]:
# Selecting rows using .loc and .iloc
# .loc looks a row up by its index label. df was built without an explicit
# index, so the labels are the default integers and label 0 is the first row.
df.loc[0]   # Row by label

In [None]:
# .iloc selects by 0-based integer position, regardless of the index labels
df.iloc[1]  # Row by position

In [None]:
# Selecting a single column: a single label returns a Series
df['Name']

In [None]:
# Selecting multiple columns: pass a list of labels (note the double brackets);
# the result is a DataFrame with the columns in the order listed
df[['Name', 'City']]

Conditional selection

In [None]:
# Keep only the rows whose Age exceeds 30: the comparison yields a boolean
# mask, and .loc returns the rows where the mask is True
df.loc[df['Age'] > 30]

Modifying data

In [None]:
# Add a 'Salary' column; the list supplies one value per existing row, in row order
salaries = [50000, 60000, 45000, 70000]
df['Salary'] = salaries
print(df)

Updating or modifying values

In [None]:
# Overwrite one cell, addressed by (row label, column name)
df.loc[0, 'City'] = 'San Francisco'
df

In [None]:
# Increment every value in the 'Age' column by one (applied to the whole column at once)
df['Age'] += 1
print(df)

Concatenate dataframes

In [None]:
# A second frame with the same Name/Age/City columns, to be stacked under df
data2 = dict(
    Name=['Dalya', 'Carlos', 'James', 'Camilla'],
    Age=[28, 24, 35, 32],
    City=['Tokyo', 'Madrid', 'Oxford', 'Prague'],
)

df2 = pd.DataFrame(data=data2)
df2

In [None]:
# Stack df2's rows under df's. ignore_index=True renumbers the result 0..n-1;
# without it both frames keep their own labels, so 0-3 would each appear twice
# and a label lookup such as df.loc[0] would return two rows.
# df2 has no 'Salary' column, so its rows get NaN in that column.
df = pd.concat([df, df2], ignore_index=True)

In [None]:
# Display the frame produced by the concatenation
df

Append row to dataframe

In [None]:
# Append the same row twice. The repeat looks intentional: the
# "Handling duplicates" section that follows detects and removes it.
# pd.concat is the public replacement for DataFrame.append (removed in
# pandas 2.0); DataFrame._append is a private method and may change without notice.
new_row = {"Name": "Andres", "Age": 26, "City": "Bogota", "Salary": 450000}
df = pd.concat([df, pd.DataFrame([new_row, new_row])], ignore_index=True)

In [None]:
# Display the frame with the appended rows
df

Handling duplicates

In [None]:
# Detecting duplicate rows: returns a boolean Series that is True for every
# row repeating an earlier row (the first occurrence is marked False)
df.duplicated()

In [None]:
# Dropping duplicate rows: the first occurrence of each row is kept.
# drop_duplicates returns a new frame, so the result is assigned back to df.
df = df.drop_duplicates()

In [None]:
# Display the frame after duplicate rows were dropped
df

Handling missing data

In [None]:
# Boolean mask of the same shape as df: True wherever a value is missing (NaN/None)
df.isna()

In [None]:
# The inverse mask: True wherever a value is present
df.notna()

In [None]:
# Remove every row that contains at least one missing value;
# dropna returns a new frame, which replaces df
df = df.dropna()

In [None]:
# Display the frame after rows with missing values were dropped
df

In [None]:
# Filling missing values with a default value
# NOTE: fillna returns a new DataFrame; the result is only displayed here,
# not assigned, so df itself is unchanged.
# NOTE(review): when the cells run in order, df has already been through
# dropna(), so there are no missing values left to fill at this point —
# confirm whether this cell was meant to run before the dropna cell.
df.fillna(0)

### Group by

In [None]:
# Average Age per City: split the rows by 'City', then take the mean of 'Age' in each group
df.groupby(by='City')['Age'].mean()

In [None]:
# Several aggregations in one pass: the dict maps each column to the
# function(s) to apply within every City group
df.groupby('City').agg({
    'Age': ['mean', 'sum'],
    'Salary': 'count',
})

### Merge

In [None]:
# Two frames that share the key column 'ID'
df3 = pd.DataFrame(dict(ID=[1, 2, 3], Name=['John', 'Anna', 'Peter']))
df4 = pd.DataFrame(dict(ID=[1, 2, 3], Age=[28, 24, 35]))
# Join them on 'ID' (inner join by default): one output row per matching ID
df3.merge(df4, on='ID')

### Date conversion

In [None]:
# Converting date strings to datetime.
# One month-start date string ('2021-01-01', '2021-02-01', ...) is generated per
# row, so the assignment always matches len(df); a hard-coded list of five
# dates raises ValueError as soon as df has any other number of rows.
date_strings = pd.date_range('2021-01-01', periods=len(df), freq='MS').strftime('%Y-%m-%d')
# pd.to_datetime parses the strings into datetime64 values
df['Date'] = pd.to_datetime(date_strings)
print(df)

In [None]:
# Split the Date column into calendar components through the .dt accessor
date_parts = df['Date'].dt
df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['Day'] = date_parts.day
print(df)

### Visualization with Pandas

In [None]:
# Plotting a simple bar chart: one bar per Name, bar height = Age
df.plot.bar(x='Name', y='Age')

### Exporting and importing data

In [None]:
# Reading a CSV file.
# Kept under its own name so the Excel read below does not immediately discard it.
df_csv = pd.read_csv('data.csv')

# Reading an Excel file; df is rebound to this frame for the cells that follow.
# Both paths are relative, so the files must exist in the working directory.
df = pd.read_excel('data.xlsx')

In [None]:
# Saving DataFrame to CSV (index=False leaves the row index out of the file)
df.to_csv('output.csv', index=False)

# Saving DataFrame to Excel
df.to_excel('output.xlsx', index=False)

### Pivot tables
Pivot tables allow for re-shaping and summarizing data.

In [None]:
# Pivot table: one row per City, holding the mean of Age for that city
pd.pivot_table(df, values='Age', index='City', aggfunc='mean')

### Melt (Unpivot)
Reshaping DataFrames from wide to long format using melt()

In [None]:
# Wide -> long: 'Name' stays as the identifier, while the 'Age' and 'City'
# columns are unpivoted into (variable, value) pairs, one row per pair
df_melted = df.melt(id_vars=['Name'], value_vars=['Age', 'City'])
df_melted

### Vectorization

In [None]:
# Vectorized arithmetic: 10 is added to the whole 'Age' column in one operation,
# with no explicit Python loop over the rows
df.loc[:, 'Age_plus_10'] = df['Age'].add(10)

In [None]:
# Display the frame, now including the 'Age_plus_10' column
df

In [None]:
# apply() calls the lambda once per element of 'Age' and collects the results
df.loc[:, 'Age_squared'] = df['Age'].apply(lambda age: age ** 2)

In [None]:
# Display the frame, now including the 'Age_squared' column
df

### Working with large datasets
Reading large datasets in chunks to save memory using chunksize.

In [None]:
# Reading a large file in chunks.
# With chunksize set, read_csv returns an iterator that yields DataFrames of
# up to 1000 rows each instead of loading the whole file at once. Using it as a
# context manager closes the underlying file even if processing raises part-way.
with pd.read_csv('large_file.csv', chunksize=1000) as chunk_iter:
    for chunk in chunk_iter:
        # NOTE(review): `process` is not defined in the visible notebook —
        # define it before running this cell.
        process(chunk)  # Custom processing function