<center><img src="https://upload.wikimedia.org/wikipedia/en/thumb/0/0b/The_Hartford_Financial_Services_Group_logo.svg/1920px-The_Hartford_Financial_Services_Group_logo.svg.png" style="width:200px; height:200px;"></center>

# 1. Data Cleaning

In [None]:
# Import Libraries
import pandas as pd
from pathlib import Path

In [None]:
# Path to the stock data CSV, relative to the notebook's current working directory
csv_path = Path('05-Stu_Data_Cleaning/Resources/stock_data.csv')

### Load CSV data into Pandas using read_csv

In [None]:
csv_data = pd.read_csv(csv_path)

### Identify the number of rows and columns (shape) in the DataFrame.

In [None]:
csv_data.shape

In [None]:
# DataFrame.shape is a (rows, columns) pair; unpack both counts in one step
nrow, ncols = csv_data.shape

print(f"Number of Rows: {nrow}")
print(f"Number of Columns: {ncols}")

### Generate a sample of the data to visually ensure data has been loaded in correctly.

In [None]:
csv_data.head()

In [None]:
csv_data.sample(5)

### Identify the number of records in the DataFrame, and compare it with the number of rows in the original file.

In [None]:
csv_data.count()

In [None]:
csv_data.info()

### Identify null records

In [None]:
csv_data.isnull().sum()

In [None]:
csv_data.isnull().sum().sum()

In [None]:
csv_data.isnull().mean() * 100

In [None]:
# example
(12/504) * 100

### Drop Null Records

In [None]:
# Drop every row that contains at least one null value.
# inplace=True modifies csv_data directly; the commented-out reassignment
# below is the equivalent form that rebinds the name instead.
csv_data.dropna(inplace=True)

# csv_data = csv_data.dropna()

### Validate nulls have been dropped

In [None]:
csv_data.isnull().sum()

In [None]:
csv_data.isnull().sum().sum()

### Default null `ebitda` values to 0. Then, validate no records are null for ebitda.

In [None]:
# Replace missing `ebitda` values with 0, then count the nulls left in that column.
# NOTE(review): the earlier `csv_data.dropna(inplace=True)` cell already removed every
# row containing a null, so on a top-to-bottom run this fill has nothing left to replace.
# If rows missing only `ebitda` are meant to be kept, this cell would need to run
# before the drop — confirm the intended order.
csv_data['ebitda'] = csv_data['ebitda'].fillna(0)
csv_data['ebitda'].isnull().sum()

### Drop Duplicates

In [None]:
# the two options produce the same results
csv_data.drop_duplicates(inplace=True)

# csv_data = csv_data.drop_duplicates()

In [None]:
csv_data.shape

### Sample `price` field

In [None]:
csv_data['price'].sample(5)

In [None]:
csv_data['price'].head()

### Clean price Series by replacing $

In [None]:
# Strip the literal dollar sign so the values can later be cast to a number.
# '$' is also the regex end-of-string anchor, and the default of `regex`
# differs between pandas versions, so regex=False is passed explicitly to
# guarantee a plain-text replacement (and to avoid the FutureWarning that
# older pandas releases emit for this call).
csv_data['price'] = csv_data['price'].str.replace('$', '', regex=False)
csv_data['price']

### Confirm data type of `price`

In [None]:
csv_data['price'].dtype

### Cast price Series as float

In [None]:
csv_data['price'] = csv_data['price'].astype('float')
csv_data['price'].dtype

In [None]:
csv_data.info()

In [None]:
csv_data.head()

### List Sectors in the Dataset

In [None]:
csv_data['sector'].unique()

In [None]:
csv_data['sector'].value_counts()

In [None]:
csv_data['sector'].value_counts(normalize=True)

In [None]:
csv_data['sector'].value_counts().plot(kind='bar')

### What is the average, min, and max earnings per share by sector?

In [None]:
# Strip the literal dollar sign from earnings per share and cast the result to float.
# regex=False forces a plain-text replacement: '$' is a regex metacharacter and the
# default of `regex` differs between pandas versions.
# NOTE: this cell cannot be re-run on its own — once the column is float,
# the .str accessor raises because the values are no longer strings.
csv_data['earnings_per_share'] = (
    csv_data['earnings_per_share']
    .str.replace('$', '', regex=False)
    .astype('float')
)

In [None]:
csv_data.groupby(by='sector').agg({
    'earnings_per_share': ['mean', 'min', 'max']
})

In [None]:
csv_data.groupby(by='sector').agg({
    'earnings_per_share': ['mean', 'min', 'max']
}).to_csv('stock_analysis.csv')

In [None]:
# Per-sector summary of earnings per share: mean, minimum and maximum.
# Selecting the column as a one-column frame keeps the two-level
# (column, statistic) header that the dict form of agg produces.
csv_data_summary = (
    csv_data
    .groupby(by='sector')[['earnings_per_share']]
    .agg(['mean', 'min', 'max'])
)

In [None]:
csv_data_summary.to_csv('stock_analysis.csv')

In [None]:
csv_data_summary.to_excel('stock_analysis.xlsx')

# 2. Crowd Funding Clean

In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# The path to our CSV file
file_path = Path("06-Stu_CleaningCrowdfunding/Unsolved/Resources/CrowdfundingData.csv")

# Read our Crowdfunding data into pandas
df = pd.read_csv(file_path)
df.head()

In [None]:
df.shape

In [None]:
# Get a list of all of our columns for easy reference
df.columns

In [None]:
# Extract "name", "goal", "pledged", "outcome", "country", "staff_pick",
# "backers_count", and "spotlight"
reduced_crowdfunding_df = df.loc[:, ["name", "goal", "pledged",
                                    "outcome", "country", "staff_pick", "backers_count", "spotlight"]]
reduced_crowdfunding_df

In [None]:
cols = ["name", "goal", "pledged", "outcome", "country", "staff_pick", "backers_count", "spotlight"]

reduced_crowdfunding_df = df[cols]

reduced_crowdfunding_df

In [None]:
# Keep only the projects that received some money (pledged strictly above zero)
reduced_crowdfunding_df = reduced_crowdfunding_df.loc[
    reduced_crowdfunding_df["pledged"].gt(0)
]

reduced_crowdfunding_df.head()

In [None]:
reduced_crowdfunding_df.shape

In [None]:
# Remove projects that made no money at all.
# Same filter as the previous cell, written with plain [] boolean indexing
# instead of .loc; once that cell has run, this one leaves the rows unchanged.
reduced_crowdfunding_df = reduced_crowdfunding_df[(
    reduced_crowdfunding_df["pledged"] > 0)]


reduced_crowdfunding_df.head()

In [None]:
reduced_crowdfunding_df.shape

In [None]:
# Collect only those projects that were hosted in the US.

# Columns to carry over into the US-only DataFrame
cols = [
    "name", "goal", "pledged", "outcome",
    "country", "staff_pick", "backers_count", "spotlight"]

# Select the US rows and the listed columns. The explicit .copy() makes
# hosted_in_us_df an independent DataFrame, so the column assignments in the
# following cells modify it directly instead of triggering pandas'
# SettingWithCopyWarning on a slice of reduced_crowdfunding_df.
hosted_in_us_df = reduced_crowdfunding_df.loc[
    reduced_crowdfunding_df["country"] == "US", cols
].copy()

hosted_in_us_df.head()

In [None]:
# Average amount pledged per backer for each US project.
# This cell only computes and displays the Series; it does not add a column
# to hosted_in_us_df (the next cell does that).
average_donation = hosted_in_us_df['pledged'] / hosted_in_us_df['backers_count']
average_donation

In [None]:
# Store the per-backer average pledge as a new "average_donation" column
hosted_in_us_df["average_donation"] = hosted_in_us_df['pledged'].div(
    hosted_in_us_df['backers_count'])

In [None]:
# Render the monetary columns as currency strings with a dollar sign,
# thousands separators and two decimals. Each column is cast to float first
# so that integer values are formatted with decimals as well.
for money_column in ("average_donation", "goal", "pledged"):
    hosted_in_us_df[money_column] = (
        hosted_in_us_df[money_column].astype(float).map("${:,.2f}".format)
    )

In [None]:
hosted_in_us_df.head()

In [None]:
# Calculate the total number of backers for all US projects
hosted_in_us_df["backers_count"].sum()

In [None]:
# Calculate the average number of backers for all US projects
hosted_in_us_df["backers_count"].mean()

In [None]:
# Keep only the US campaigns whose "staff_pick" flag equals True
picked_by_staff_df = hosted_in_us_df.loc[
    hosted_in_us_df["staff_pick"].eq(True)
]
picked_by_staff_df.head()

In [None]:
# Group by the outcome of the campaigns and see if staff picks matter (Seems to matter quite a bit)

outcome_groups = picked_by_staff_df.groupby("outcome")
outcome_groups["name"].count()

In [None]:
picked_by_staff_df['outcome'].value_counts()

# 3. Group Dynamics

In [None]:
# Import Libraries
import pandas as pd
from pathlib import Path

### Load CSV data into Pandas using read_csv, as well as arguments index_col, parse_dates, and infer_datetime_format

In [None]:
# Read in CSV file, using the `data_date` column as the index.
# parse_dates=True asks pandas to try parsing the index values as dates.
# The doubled slash in the path is harmless: pathlib collapses it when building the Path.
# NOTE(review): `infer_datetime_format` is listed as deprecated in the pandas >= 2.0
# read_csv documentation — confirm against the installed version. The next cell
# shows the same call without it.
csv_path = Path("06-Stu_Groupby//Resources/crypto_data.csv")
crypto_data = pd.read_csv(csv_path, index_col='data_date', parse_dates=True, infer_datetime_format=True)
crypto_data.head()

In [None]:
crypto_data = pd.read_csv(csv_path, index_col='data_date', parse_dates=True)
crypto_data.head()

In [None]:
crypto_data = pd.read_csv(csv_path, index_col='data_date', parse_dates=['data_date'])
crypto_data.head()

### Clean data

In [None]:
# Drop the extraneous `data_time` and `timestamp` columns, then drop rows with nulls.
# Both calls use inplace=True, so crypto_data itself is modified.
# NOTE: re-running this cell without reloading the CSV fails, because the
# two columns are already gone and drop() raises on missing labels by default.
crypto_data.drop(columns=['data_time','timestamp'], inplace=True)
crypto_data.dropna(inplace=True)

# Equivalent form that reassigns the name instead of mutating in place:
# crypto_data = crypto_data.drop(columns=['data_time','timestamp'])
# crypto_data = crypto_data.dropna()
crypto_data.head()

### Group data by cryptocurrency and plot on the same chart

In [None]:
crypto_data.groupby('cryptocurrency')['data_priceUsd'].plot(legend=True)

In [None]:
crypto_data_plot = crypto_data.groupby('cryptocurrency')['data_priceUsd'].plot(legend=True);

In [None]:
crypto_by_type = crypto_data.groupby('cryptocurrency')['data_priceUsd']

In [None]:
crypto_by_type.plot(legend=True, figsize=(12,6));

In [None]:
crypto_by_type.plot(legend=True, figsize=(12,6), title='Crypto Currency Performance', xlabel='Date', ylabel='Price USD');

In [None]:
crypto_by_type.plot(legend=True, figsize=(12,6), title='Crypto Currency Performance', xlabel='Date', ylabel='Price USD', style='--');

In [None]:
# matplotlib is imported here only to switch the plotting style sheet
import matplotlib.pyplot as plt

# list of available styles can be found here https://matplotlib.org/stable/gallery/style_sheets/style_sheets_reference.html

# Apply the 'ggplot' style sheet; it stays active for plots drawn after this point
plt.style.use('ggplot')

# One line per cryptocurrency on a shared chart; the trailing ';' suppresses the text repr of the result
crypto_by_type.plot(legend=True, figsize=(12,6), title='Crypto Currency Performance', xlabel='Date', ylabel='Price USD', style='-');


### Calculate average price across two years for each cryptocurrency

In [None]:
crypto_data.index

In [None]:
crypto_data.index.year

In [None]:
crypto_data.index.year.unique()

In [None]:
# Determine average price across two years
crypto_data_avg = crypto_data.groupby('cryptocurrency')['data_priceUsd'].mean()
crypto_data_avg

### Calculate max price across two years for each cryptocurrency

In [None]:
crypto_data_max = crypto_data.groupby('cryptocurrency')['data_priceUsd'].max()
crypto_data_max

### Calculate min price across two years for each cryptocurrency

In [None]:
crypto_data_min = crypto_data.groupby('cryptocurrency')['data_priceUsd'].min()
crypto_data_min

# 4. Search for The Worst

In [None]:
# Import Dependencies
import pandas as pd

In [None]:
# Create reference to CSV file
csv_path = "09-Stu_SearchForTheWorst/Unsolved/Resources/SFO_Airport_Utility_Consumption.csv"

# Import the CSV into a pandas DataFrame
consumption_df = pd.read_csv(csv_path)
consumption_df

In [None]:
# Collect a list of all the unique values in "Utility"
consumption_df["Utility"].unique()

In [None]:
# Restrict the data to electricity readings whose owner is "Tenant"
electricity_df = consumption_df.loc[
    consumption_df["Utility"].eq("Electricity")
    & consumption_df["Owner"].eq("Tenant")
]
electricity_df.head()

In [None]:
# Order rows from highest to lowest "Usage" so the worst day comes first,
# then renumber the index from 0 in that sorted order (old index discarded)
electricity_df = (
    electricity_df
    .sort_values("Usage", ascending=False)
    .reset_index(drop=True)
)

electricity_df.head()

In [None]:
# Save all of the information collected on the worst day.
# Label 0 is the highest-usage row only because the previous cell sorted by
# "Usage" in descending order and reset the index.
worst_day = electricity_df.loc[0, :]
worst_day

# 5. Comic Books CSV

In [None]:
# Import Dependencies
import pandas as pd

In [None]:
# Make a reference to the comic_books_expanded.csv file path
csv_path = "10-Stu_ComicBooksCSV/Unsolved/Resources/comic_books_expanded.csv"

# Import the comic_books_expanded.csv file as a DataFrame
books_df = pd.read_csv(csv_path, encoding="utf-8")
# Check the special characters imported correctly
books_df.tail()

In [None]:
# Remove unnecessary columns from the DataFrame and save the result as a new DataFrame.
# Only keep: "ISBN", "Title", "Other titles", "Name", "All names",
# "Country of publication", "Place of publication", "Publisher", "Date of publication"
reduced_df = books_df[["ISBN", "Title", "Other titles", "Name", "All names", 
                       "Country of publication", "Place of publication", 
                       "Publisher", "Date of publication"]]
reduced_df.head()

In [None]:
# Rename the headers to be more explanatory
renamed_df = reduced_df.rename(columns={"Other titles": "Other Titles",
                                        "Name": "Author",
                                        "All names": "All Names",
                                        "Country of publication": "Country of Publication",
                                        "Place of publication": "Place of Publication",
                                        "Date of publication": "Publication Year", })
renamed_df.head()

In [None]:
reduced_df.columns

In [None]:
# Alternative way to set headers: assign a complete list of names to DataFrame.columns.
# NOTE(review): this list repeats the names reduced_df was selected with, so the
# assignment leaves the headers unchanged — presumably a demonstration of the
# syntax rather than an actual rename; confirm.
cols = ['ISBN', 'Title', 'Other titles', 'Name', 'All names',
       'Country of publication', 'Place of publication', 'Publisher',
       'Date of publication']

reduced_df.columns = cols

In [None]:
# Push the remade DataFrame to a new CSV file
renamed_df.to_csv("10-Stu_ComicBooksCSV/Unsolved/Output/books_clean.csv",
                  encoding="utf-8", index=False, header=True)

# 6. Comic Books Summary

In [None]:
# Import Dependencies
import pandas as pd

# File to Load
comics_path = "11-Stu_ComicBooksSummary/Unsolved/Resources/books_clean.csv"

# Read the modified Comic Books csv and store into Pandas DataFrame
comics_df = pd.read_csv(comics_path, encoding="utf-8")
comics_df.head()

In [None]:
# Number of distinct values in "Author". dropna=False keeps a missing value
# counted as its own entry, which is what len(Series.unique()) does.
author_count = comics_df["Author"].nunique(dropna=False)

# Number of distinct values in "Country of Publication" (same counting rule)
country_count = comics_df["Country of Publication"].nunique(dropna=False)

# Earliest and latest values found in "Publication Year"
earliest_year = comics_df["Publication Year"].min()
latest_year = comics_df["Publication Year"].max()

In [None]:
# Build a one-row summary table; every value is wrapped in a single-item
# list so that each column is specified the same way
summary_df = pd.DataFrame({
    "Total Unique Authors": [author_count],
    "Total Unique Publication Countries": [country_count],
    "Earliest Year": [earliest_year],
    "Latest Year": [latest_year],
})
summary_df

# 7. Mastering Concatenation

In [None]:
import pandas as pd
from pathlib import Path, PurePath

# Folder that holds the four CSV files used in this section
directory = Path('13-Stu_Concat_Dataframes/Resources/')

# Dues files for the two organisations
fin_leaders_america_path = directory.joinpath('fin_leaders_america.csv')
investors_leadership_path = directory.joinpath('invstrs_leadership.csv')

# Member-status files for the same two organisations
fin_leaders_mem_path = directory.joinpath('fin_leaders_members.csv')
investors_leadership_mem_path = directory.joinpath('invstrs_leadership_members.csv')

fin_leaders_dues = pd.read_csv(fin_leaders_america_path)
investors_leadership_dues = pd.read_csv(investors_leadership_path)
fin_leaders_mbr_status = pd.read_csv(fin_leaders_mem_path)
investors_leadership_mbr_status = pd.read_csv(investors_leadership_mem_path)

### Concat dues data using rows axis and inner join

In [None]:
combined_dues = pd.concat([fin_leaders_dues,investors_leadership_dues], axis='rows', join='inner')
combined_dues

In [None]:
combined_dues = pd.concat([fin_leaders_dues,investors_leadership_dues], axis=0, join='inner')
combined_dues

### Concat member data using rows axis and inner join

In [None]:
combined_members = pd.concat([fin_leaders_mbr_status,investors_leadership_mbr_status], axis='rows', join='inner')
combined_members

### Concat dues and member data using columns axis and inner join

In [None]:
# Combine dues and member-status data side by side (axis='columns'), keeping only
# index labels present in both frames (join='inner'). Exact duplicate rows are
# removed from each input first.
# NOTE(review): both inputs were built by row-wise concat without ignore_index, so
# their index labels may repeat; column-wise concat aligns on the index — verify
# that rows line up as intended for this data.
dues_member_status = pd.concat([combined_dues.drop_duplicates(),combined_members.drop_duplicates()], axis='columns', join='inner')
dues_member_status