# Introducing Pandas

- Sections

- Introduction to Pandas

- Creating and Loading Data

- Exploring DataFrames

- Data Cleaning

- Visualization with Pandas

- Data Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

# 1. Creating a DataFrame from a Dictionary

In [None]:
# Column-oriented records: every key becomes a column and every list holds
# that column's values, one entry per person.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Age=[24, 27, 22, 32, 29],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    Salary=[70000, 80000, 50000, 120000, 75000],
)
data_df = pd.DataFrame(data=data)
print("\n--- DataFrame ---\n\n")
# A bare expression on the last line is rendered as the cell's rich output.
data_df

# 2. Loading Data from a CSV file (example)

In [3]:
# Read the CSV with pandas' defaults: no column is designated as the row index.
df = pd.read_csv(filepath_or_buffer='Boston.csv')

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [5]:
# Re-read the same file, this time using its first column as the row index.
df = pd.read_csv(
    'Boston.csv',
    index_col=0,
)

## 2.1 Loading in .txt files

In [6]:
# read_csv also handles plain-text tables: the delimiter here is a single
# space and the first column is used as the row index.
df_txt = pd.read_csv(
    'Masses_V2_Table.txt',
    sep=' ',
    index_col=0,
)

In [None]:
# Preview the first five rows of the text-file table.
df_txt.head(n=5)

# Adding in a row with some NaNs in the DataFrame for later analysis

In [8]:
# One synthetic observation with a value for every column of df; the final
# three entries are deliberately NaN so there is missing data to work with.
row_values = [
    0.004, 12, 0, .432,
    4.56, 54.1, 5.42, 4,
    3, 210, np.nan, np.nan, np.nan,
]
new_row = pd.DataFrame(
    np.array(row_values)[np.newaxis, :],  # 2-D array holding a single row
    columns=df.columns.values,
    index=[3456453],
)

# Stack the extra row underneath the original frame.
merge_df = pd.concat(objs=[df, new_row])

In [None]:
# Show the last five rows, which include the row that was just appended.
merge_df.tail(n=5)

# 3. Exploring the DataFrame

In [None]:
# dtypes is a Series mapping each column name to its data type.
column_dtypes = df.dtypes
print("\nData Types:\n", column_dtypes)

In [None]:
# Banner first, then the per-column summary table as the cell's rich output.
print("\nSummary Statistics:\n")
summary_stats = df.describe()
summary_stats

In [None]:
# Banner first, then only the first three rows of the frame.
print("\nFirst 3 Rows:\n")
df.head(n=3)

# 4. Data Cleaning (Handling Missing Values)

In [None]:
# Count the NaNs in each column: isna() flags every missing cell with True,
# and summing the flags down each column gives the number of NaNs in it.
missing_flags = df.isna()
missing_flags.sum()

In [None]:
# Same per-column NaN count, now on the frame that includes the appended row.
merge_df.isna().sum(axis=0)

In [None]:
# Ways to handle NaNs
# Option 1: drop every row that contains at least one NaN.
df_no_missing = merge_df.dropna(axis=0, how='any')
# Per-column NaN count of the result.
df_no_missing.isna().sum()

In [None]:
# Option 2: replace every NaN with one fixed value (here 0).
df_fill = merge_df.fillna(value=0)
# Per-column NaN count of the result.
df_fill.isna().sum()

In [None]:
#3. Fill NaNs with the mean of the column
# The fill values are computed from merge_df itself (mean() skips NaNs by
# default), so the imputation always uses the frame that is being filled
# rather than a different DataFrame that happens to be in the kernel.
column_means = merge_df.mean()
df_mean = merge_df.fillna(column_means)
# Per-column NaN count of the result.
df_mean.isna().sum()

In [None]:
#4. Fill NaNs with the median of the column
# The medians are computed from merge_df itself (median() skips NaNs by
# default), so the imputation always uses the frame that is being filled
# rather than a different DataFrame that happens to be in the kernel.
column_medians = merge_df.median()
df_median = merge_df.fillna(column_medians)
# Per-column NaN count of the result.
df_median.isna().sum()

In [None]:
#5. Fill NaNs with the mode of the column
# mode() returns a DataFrame because a column can have several modes;
# .iloc[0] keeps the first one per column. The modes are computed from
# merge_df itself (NaNs are ignored by default), so the imputation always
# uses the frame that is being filled.
column_modes = merge_df.mode().iloc[0]
df_mode = merge_df.fillna(column_modes)
# Per-column NaN count of the result.
df_mode.isna().sum()

# 5. Filtering and Sorting

In [None]:
# Boolean masking to filter the DataFrame: the comparison yields a True/False
# Series, and indexing with it keeps only the rows where it is True.
dis_cutoff = 2
df_filtered = df[df['dis'] < dis_cutoff]
# The banner is built from the same cutoff as the mask so the printed
# condition cannot disagree with the filter that was actually applied.
print(f"\nFiltered Data (dis < {dis_cutoff}):\n")
df_filtered

In [None]:
# Sort by the 'dis' column in descending order (largest value at the top).
# sort_values returns a new frame; df itself is left unchanged.
df.sort_values('dis', ascending=False)

In [None]:
# Sort by the 'dis' column in ascending order (smallest value at the top).
# sort_values returns a new frame; df itself is left unchanged.
df.sort_values('dis', ascending=True)

# 6. Visualization with Pandas

In [None]:
# Scatter plot straight from the DataFrame: 'indus' on x, 'crim' on y.
df.plot.scatter(x='indus', y='crim', title='Crime vs Industry')
plt.show()

In [None]:
# Same scatter plot, but with the default column-name axis labels replaced by
# readable ones. The plot call returns the Axes it drew on.
scatter_ax = df.plot.scatter(x='indus', y='crim', title='Crime vs Industry')
scatter_ax.set_xlabel('Industry')
scatter_ax.set_ylabel('Crime')
plt.show()

In [25]:
# Both tables are read the same way: single-space delimiter, first column
# used as the row index.
continuum_read_opts = dict(sep=' ', index_col=0)
features_df = pd.read_csv('Features_with_Continuum.txt', **continuum_read_opts)
predictors = pd.read_csv('Predictions_with_Continuum.txt', **continuum_read_opts)

In [None]:
# Preview the first five rows of the features table.
features_df.head(n=5)

In [None]:
# Preview the first five rows of the predictions table.
predictors.head(n=5)

In [28]:
# Row filter 1: chisq_phot below 50 (a boolean Series indexed like features_df).
good_fits_mask = features_df['chisq_phot'] < 50

# Row filter 2: EW_r below 500, taken as a plain NumPy array so that the
# combination below is element-by-element rather than label-aligned.
EW_r_mask = predictors['EW_r'].values < 500

# A row is kept only when it passes both filters.
total_mask = good_fits_mask & EW_r_mask

# NOTE(review): total_mask carries features_df's index, so selecting from
# predictors with it presumably relies on both tables sharing the same index
# - confirm against the input files.
good_fits_data = features_df[total_mask]
y_pred = predictors[total_mask]['EW_r']



In [None]:
# 2x2 grid with one seaborn box plot per selected column.
fig, axes = plt.subplots(2, 2, figsize = (10, 10))

# Flatten the 2-D array of Axes so it can be paired with the column names.
ax = axes.flatten()

cols= ['burst', 'dust:Av', 'stellar_mass', 'sfr']

for column, a in zip(cols, ax):
    # Pass the values by keyword: a bare positional Series is read as `x` by
    # seaborn < 0.12 and as `data` by later releases, so the box orientation
    # used to depend on the installed version. `y=` pins a vertical box.
    sb.boxplot(y = good_fits_data[column], ax = a)
    # Label each panel explicitly so it can be identified on its own.
    a.set_ylabel(column)

plt.show()

# 7. Data Analysis

A core part of our work is taking a data set and learning something from it. This is the essence of data analysis: we use plots and summary statistics to uncover trends hidden in the data and to understand what the data is telling us. The next few cells walk through some common data analysis plots and techniques so that you can familiarize yourself with what it means to analyze a data set.

In [None]:
# 2x2 grid with one 30-bin histogram per selected column.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
ax = axes.flatten()
cols = ['burst', 'dust:Av', 'stellar_mass', 'sfr']

for position, name in enumerate(cols):
    panel = ax[position]
    panel.hist(good_fits_data[name], bins=30, color='purple')
    panel.set_xlabel(name)

# Only the left-hand column of panels (flattened positions 0 and 2) gets a
# y-axis label.
for left_panel in axes[:, 0]:
    left_panel.set_ylabel('Counts')

plt.show()

In [None]:
# 2x2 grid: each selected column on the x-axis against y_pred on the y-axis.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
ax = axes.flatten()
cols = ['burst', 'dust:Av', 'stellar_mass', 'sfr']

for position, name in enumerate(cols):
    panel = ax[position]
    panel.scatter(good_fits_data[name], y_pred, color='purple', alpha=0.5, s=10)
    panel.set_xlabel(name)

# Only the left-hand column of panels (flattened positions 0 and 2) gets a
# y-axis label.
for left_panel in axes[:, 0]:
    left_panel.set_ylabel('EW_r')

plt.show()

In [None]:
# Box plots through pandas' own plotting API: subplots=True gives every
# column its own panel, arranged on a 2x2 layout.
box_cols = ['burst', 'dust:Av', 'stellar_mass', 'sfr']
good_fits_data[box_cols].plot.box(
    subplots=True,
    layout=(2, 2),
    figsize=(10, 10),
)
plt.show()

In [None]:
# pairplot creates its own figure, so a preceding plt.figure(...) call only
# produced an empty extra figure and its figsize was never applied. The size
# is controlled through `height` (inches per panel) instead: 4 variables at
# 2.5 inches each gives a figure of roughly 10 x 10 inches.
# corner=True draws only the lower triangle of the grid.
sb.pairplot(
    good_fits_data[['burst', 'dust:Av', 'stellar_mass', 'sfr']],
    corner = True,
    height = 2.5,
)
plt.show()

In [34]:

# Pairwise Pearson correlation between the four selected columns.
corr_cols = ['burst', 'dust:Av', 'stellar_mass', 'sfr']
corr_matrix = good_fits_data.loc[:, corr_cols].corr(method='pearson')

In [None]:
# Draw the correlation matrix as a heatmap on an explicitly created Axes;
# annot=True writes each coefficient inside its cell.
heat_fig, heat_ax = plt.subplots(figsize=(10, 5))
sb.heatmap(corr_matrix, annot=True, ax=heat_ax)
plt.show()