In [None]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path

# Locations of the two source CSV files
school_data = Path("Resources/schools_complete.csv")
student_data = Path("Resources/students_complete.csv")

# Read each file into its own pandas DataFrame
school_data_df = pd.read_csv(school_data)
student_data_df = pd.read_csv(student_data)

# Combine the student and school data into one dataset.
# The left join keeps every student row, even if its school_name has no
# match in the school file. The join key is named once: listing the same
# column twice adds nothing to the join.
school_data_merged = pd.merge(student_data_df, school_data_df, how="left", on="school_name")

school_data_merged.head(3)

# Local Government Area (LGA) Summary
# Headline metrics for the whole area, computed from the merged
# student/school data.
# TODO: collect these values into a single summary DataFrame.

# Minimum score counted as a pass in the calculations below
PASS_MARK = 50

# Number of distinct school names
Totalnumber_of_uniqueschools = len(school_data_merged["school_name"].unique())

# Number of rows in the merged data (assumes one row per student — confirm)
Total_students = len(school_data_merged["school_name"])

# Total budget across schools. Take one budget value per school before
# summing: summing the distinct budget *values* would silently drop a school
# whenever two schools happen to share the same budget figure.
Total_budget = school_data_merged.drop_duplicates(subset="school_name")["budget"].sum()

# Mean scores over all rows
Average_maths_score = school_data_merged["maths_score"].mean()
Average_reading_score = school_data_merged["reading_score"].mean()

# Maths: number and percentage of rows at or above the pass mark
num_students_passing_maths = len(
    school_data_merged.loc[school_data_merged["maths_score"] >= PASS_MARK]
)
percentage_students_passing_maths = (num_students_passing_maths / Total_students) * 100

# Reading: number and percentage of rows at or above the pass mark
num_students_passing_reading = len(
    school_data_merged.loc[school_data_merged["reading_score"] >= PASS_MARK]
)
percentage_students_passing_reading = (num_students_passing_reading / Total_students) * 100

# Rows that reach the pass mark in both maths and reading
overall_passing_df = school_data_merged.loc[
    (school_data_merged["maths_score"] >= PASS_MARK)
    & (school_data_merged["reading_score"] >= PASS_MARK),
    :,
]

# Percentage passing both subjects; as the final expression it is the value
# the cell displays.
percentage_students_passing_overall = (len(overall_passing_df) / Total_students) * 100
percentage_students_passing_overall

In [None]:
# Split the merged data into one DataFrame per year level (9 to 12).
# Each filter compares the fourth column (position 3) with the year number
# and keeps only the rows where the comparison is True.
# NOTE(review): the column is picked by position; presumably it is the
# students' year column — confirm, and prefer selecting it by name.
year_9 = school_data_merged.loc[school_data_merged.iloc[:, 3].eq(9)]
year_10 = school_data_merged.loc[school_data_merged.iloc[:, 3].eq(10)]
year_11 = school_data_merged.loc[school_data_merged.iloc[:, 3].eq(11)]
year_12 = school_data_merged.loc[school_data_merged.iloc[:, 3].eq(12)]
year_12.head(3)

# Kept for reference: wrapping a filtered result in pd.DataFrame if needed
# year_9_df = pd.DataFrame(year_9)

In [None]:
# Display the column labels of the merged DataFrame
school_data_merged.columns
# Idea, currently disabled: index the merged data by school name so that all
# per-school calculations can use the school as the index
# mergeddata_byschools = school_data_merged.set_index("school_name")
# mergeddata_byschools.head(3)

In [None]:
# Display an overview of the school column: how many rows of the merged data
# carry each school name
school_data_merged['school_name'].value_counts()

In [None]:
# TODO: create a DataFrame, sort it by overall passing, and take head(4) and
# tail(4).

# TODO: maths scores per year level
#   1. create a pandas Series of maths scores for each year using a conditional statement
#   2. group each Series by school
#   3. combine the Series into a DataFrame
#   4. optional: give the displayed data cleaner formatting

# Placeholder example: a Series built from a plain list of values
data_series = pd.Series(
    data=[
        "University of Melbourne",
        "UWA",
        "University of Sydney",
        "La Trobe University",
        "University of Technology Sydney",
    ]
)

# Placeholder example: a two-column DataFrame built from rows plus column labels
pharaoh_df = pd.DataFrame(
    data=[
        ["Early Dynastic Period", "Thinis"],
        ["Old Kingdom", "Memphis"],
    ],
    columns=["Dynasty", "Pharaoh"],
)

# TODO: reading scores per year level, following the same four steps as maths

# TODO: use pd.cut() to build budget bins and show performance per bin

In [None]:
# Potentially useful reference snippets
# NOTE(review): several DataFrames used further down this cell (for example
# data_file_df) are not defined in the visible part of this notebook — confirm
# where they come from before running the cell from top to bottom.

# Create a file path for the data.
comics_path = Path("Resources/books_clean.csv")

# Read the CSV at comics_path into a pandas DataFrame, decoding it as UTF-8
comics_df = pd.read_csv(comics_path, encoding="utf-8")
# Preview the first rows
comics_df.head()

# Reference a single column within a DataFrame (one label returns a Series)
data_file_df["Amount"].head()

# Reference multiple columns within a DataFrame (a list of labels returns a DataFrame)
data_file_df[["Amount", "Gender"]].head()

# The mean method averages the Series
average = data_file_df["Amount"].mean()
average

# The sum method totals the Series
total = data_file_df["Amount"].sum()
total

# The value_counts method counts how often each distinct value occurs in a column
count = data_file_df["Gender"].value_counts()
count

# Calculations can also be performed on a Series and the result added to the
# DataFrame as a new column (this adds the column to data_file_df itself)
thousands_of_dollars = data_file_df["Amount"]/1000
data_file_df["Thousands of Dollars"] = thousands_of_dollars

# Reorganise the columns by selecting them in the wanted order with double brackets
organised_df = training_df[["Name","Trainer","Weight","Membership(Days)"]]

# Use .rename(columns={...}) to rename columns; the result is assigned to a new name
renamed_df = organised_df.rename(columns={"Membership(Days)":"Membership in Days", "Weight":"Weight in Pounds"})

# Export a DataFrame as a CSV, without the pandas index but with the header row
# NOTE(review): file_one_df is not defined in the visible part of this notebook.
file_one_df.to_csv("Output/fileOne.csv", index=False, header=True)

# Write renamed_df to a new UTF-8 encoded CSV file, again without the index
# NOTE(review): the file name refers to books although renamed_df is derived
# from training_df — confirm the intended frame and file name.
renamed_df.to_csv("Output/books_clean.csv",
                  encoding="utf-8", index=False, header=True)

# Calculate the number of unique authors in the DataFrame
author_count = len(comics_df["Author"].unique())

# Calculate the earliest/latest year a book was published
earliest_year = comics_df["Publication Year"].min()
latest_year = comics_df["Publication Year"].max()

# Place all of the data found into a summary DataFrame
# NOTE(review): country_count is not assigned anywhere in the visible part of
# this notebook; presumably it should be computed the same way as
# author_count — confirm before running.
summary_df = pd.DataFrame({"Total Unique Authors": [author_count],
                              "Total Unique Publication Countries": country_count,
                              "Earliest Year": earliest_year,
                              "Latest Year": latest_year})
summary_df

# loc with ":" selects every row; the list limits the result to the
# "Common Name" and "Threatened status" columns
df.loc[:, ["Common Name", "Threatened status"]].head()

# A boolean mask can be built from a positional lookup as well: compare the
# third column (position 2) with "Endangered" and keep the rows that match
also_only_endangered = df.loc[df.iloc[:, 2].eq("Endangered")]
also_only_endangered

# Several accepted values widen the filter: keep rows whose status is either
# "Endangered" or "Critically Endangered"
only_endangered_and_critical = df.loc[
    df["Threatened status"].isin(["Endangered", "Critically Endangered"])
]
only_endangered_and_critical

# Keep only the films with an IMDB score above 7, retaining just the three
# listed columns (other rating columns are left out)
good_movies_df = movie_file_df.loc[movie_file_df["IMDB"] > 7, [
    "FILM", "IMDB", "IMDB_user_vote_count"]]
good_movies_df.head()

# Finally, export to a spreadsheet, without the index, to keep track of the
# new watch list
# NOTE(review): this writes unknown_movies_df, which is not defined in the
# visible part of this notebook, rather than good_movies_df built above, and
# the folder is spelled "output" here but "Output" elsewhere — confirm both.
unknown_movies_df.to_excel("output/movieWatchlist.xlsx", index=False)

# Remove a column that is not needed (modifies the DataFrame in place)
del employment_data_df["OCCUP_INDEX"]

# Non-null counts per column; columns with lower counts have missing values
employment_data_df.count()

# Discard every row that has at least one missing value
employment_data_df = employment_data_df.dropna(how="any")

# Inspect the column dtypes; YEAR is expected to be an integer column
employment_data_df.dtypes

# Cast YEAR to int; errors="raise" makes a failed conversion raise an error
employment_data_df = employment_data_df.astype({"YEAR": int}, errors="raise")

# Check the dtype of YEAR after the cast
employment_data_df["YEAR"].dtype

# How often each OCCUP value occurs
employment_data_df["OCCUP"].value_counts()

# Normalise the OCCUP categories by mapping each variant to its preferred form
employment_data_df["OCCUP"] = employment_data_df["OCCUP"].replace(
    {
        "Laborer": "Labourer",
        "Stone Mason": "Stonemason",
        "Boot Maker": "Bootmaker",
        "Coachtrimmer": "Coach Trimmer",
        "None": "None at present",
    }
)

# Statistical overview of the DataFrame (count, mean, min, max, ...)
employment_data_df.describe()

# Fill NA values for the column "SURFACE_TYPE" with "Unknown"
road_stops_reduced = road_stops_reduced.fillna({"SURFACE_TYPE": "Unknown"})
road_stops_reduced

# Wrap lg_counts in a DataFrame
# NOTE(review): lg_counts is not assigned in the visible part of this
# notebook — confirm where it is computed; it is used again further down.
lg_inaccessible_stops_df = pd.DataFrame(lg_counts)

# Group the road stops by local government area (this line only builds the
# GroupBy object; no counting happens here)
grouped_lg_df = road_stops_df.groupby(["LOCAL_GOVERNMENT_NAME"])

# Get the total "NUMBER_OF_BINS", "NUMBER_OF_TOILETS", "NUMBER_OF_TABLES" per group.
grouped_lg_df[["NUMBER_OF_BINS", "NUMBER_OF_TOILETS", "NUMBER_OF_TABLES"]].sum()

# Save the per-group toilets and tables sums under their own names
lg_toilets = grouped_lg_df["NUMBER_OF_TOILETS"].sum()
lg_tables = grouped_lg_df["NUMBER_OF_TABLES"].sum()

# Create a new DataFrame from lg_counts plus the toilet and table totals
lg_summary_df = pd.DataFrame({"Number of Road Stops with Accessibility Issues": lg_counts,
                              "Total Toilets": lg_toilets,
                              "Total Tables": lg_tables})

# It is also possible to group a DataFrame by multiple columns
# This returns an object with multiple indexes, however, which can be harder to deal with
# NOTE(review): the column is spelled "SURFACE" here but "SURFACE_TYPE" in the
# fillna call on road_stops_reduced — confirm the column name in road_stops_df.
grouped_lg_surface = road_stops_df.groupby(["LOCAL_GOVERNMENT_NAME","SURFACE"])

# Sum toilets and tables per (area, surface) group and wrap the result in a DataFrame
lg_issues_df = pd.DataFrame(
    grouped_lg_surface[["NUMBER_OF_TOILETS", "NUMBER_OF_TABLES"]].sum())

# GroupBy is also useful for averages: mean bins, toilets and tables per
# local government area, showing the first 10 groups
amenities_df = road_stops_df[["LOCAL_GOVERNMENT_NAME", "NUMBER_OF_BINS", "NUMBER_OF_TOILETS", "NUMBER_OF_TABLES"]]
amenities_df.groupby(["LOCAL_GOVERNMENT_NAME"]).mean().head(10)

# Join info_df and items_df on "customer_id", once for each join type.
# inner: only keys present in both frames
inner_merge_df = info_df.merge(items_df, how="inner", on="customer_id")
# outer: every key from either frame
outer_merge_df = info_df.merge(items_df, how="outer", on="customer_id")
# left: every key from info_df
left_merge_df = info_df.merge(items_df, how="left", on="customer_id")
# right: every key from items_df
right_merge_df = info_df.merge(items_df, how="right", on="customer_id")

# Keep only the rows that have a domestic health expenditure value
gdp_expenditure_df = gdp_expenditure_df.dropna(
    subset=["Domestic Health Expenditure (in millions of USD)"], how="any"
)

# New column: domestic health expenditure as a percentage of GDP
# NOTE(review): gdp_over_10bil_df is not created in the visible part of this
# notebook — confirm how it is derived from gdp_expenditure_df.
gdp_over_10bil_df["Health Expenditure (% GDP)"] = (
    gdp_over_10bil_df["Domestic Health Expenditure (in millions of USD)"]
    / gdp_over_10bil_df["GDP (in millions of USD)"]
    * 100
)

# Order the rows from highest to lowest percentage, then renumber the index
# from 0 so that label 0 is the highest row
health_expenditure_sorted_df = gdp_over_10bil_df.sort_values(
    ["Health Expenditure (% GDP)"], ascending=False
).reset_index(drop=True)

# Row with the highest percentage of GDP spent on domestic health
highest_health_expenditure = health_expenditure_sorted_df.loc[0]

# Bonus: the row with the lowest percentage, i.e. the last index label
health_expenditure_sorted_df.loc[len(health_expenditure_sorted_df) - 1]

# Bin edges for the score bands: 0, 59.9, 69.9, 79.9, 89.9, 100.
# Six edges define five bands.
bins = [0, 59.9, 69.9, 79.9, 89.9, 100]

# One label per band, lowest band first
group_names = ["F", "D", "C", "B", "A"]

# Assign each test score to its band; include_lowest=True makes the first
# band include its lower edge (a score of exactly 0)
test_scores_df["Test Score Summary"] = pd.cut(
    test_scores_df["Test Score"], bins, labels=group_names, include_lowest=True
)

# Group by band and show the maximum of each column per band.
# pd.cut produces a categorical column; observed=False is stated explicitly so
# that every band appears in the result (even an empty one) regardless of the
# pandas version's default, and so that pandas does not warn about relying on
# the implicit default.
# NOTE(review): this rebinds test_scores_df from a DataFrame to a GroupBy
# object, so the lines above cannot be re-run afterwards — consider a
# separate name for the grouped object.
test_scores_df = test_scores_df.groupby("Test Score Summary", observed=False)
test_scores_df.max()

# Turn the numeric columns into display strings with Series.map.
# NOTE: after this runs the columns hold text, so the formatting cannot be
# applied a second time to the same frame.

# Currency columns: dollar sign, thousands separators, two decimal places
for currency_column in ("INCOME", "COSTS"):
    file_df[currency_column] = file_df[currency_column].map("${:,.2f}".format)

# Fraction columns: multiply by 100 and show one decimal place with a % sign
for percent_column in (
    "PERCENT30",
    "PERCENT3050",
    "PERCENT50",
    "PERCENT_NODATA",
    "PERCENT_NOBURDEN",
):
    file_df[percent_column] = (file_df[percent_column] * 100).map("{:.1f}%".format)

# Count column: thousands separators only
file_df["TOTAL"] = file_df["TOTAL"].map("{:,}".format)

# Keep only the rows whose country value is "AU".

# Columns to carry over into the new DataFrame
columns = [
    "name",
    "goal",
    "pledged",
    "outcome",
    "country",
    "staff_pick",
    "backers_count",
    "spotlight",
]

# Rows where "country" equals "AU", restricted to the columns listed above
hosted_in_aus_df = reduced_crowdfunding_df.loc[
    reduced_crowdfunding_df["country"].eq("AU"), columns
]

