In [None]:
# Setup cell: makes Google Drive reachable from this notebook and switches the
# working directory into it, so later relative paths resolve inside Drive.
from google.colab import drive

# Mount Google Drive under /content/drive
drive.mount('/content/drive')

# Navigate to the desired directory (must run after the mount above,
# otherwise the target folder does not exist yet)
import os
os.chdir('/content/drive/My Drive/')

# Verify the current working directory
# (`!` is the IPython shell escape, so this line only works inside a notebook)
!pwd


### Pandas Study Guide

### Introduction to Pandas DataFrame

A DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns). It is similar to a table in a database or an Excel spreadsheet. Each column in a DataFrame can be of a different data type (e.g., integers, floats, strings).

### Creating a Simple DataFrame

You can create a DataFrame by passing a dictionary of lists or a list of dictionaries to the `pd.DataFrame()` function.

In [None]:
import pandas as pd

# Column-oriented input: each keyword becomes a column name and each list
# supplies that column's values, one entry per row.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[24, 27, 22, 32, 29],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
)

# Turn the dictionary of lists into a five-row, three-column DataFrame.
df = pd.DataFrame(data)

# Bare last expression so the notebook renders the frame.
df

### Reading a CSV File

You can read a CSV file into a DataFrame using the `pd.read_csv()` function. Let's read the CSV file that was uploaded previously.

In [None]:
# NOTE(review): this absolute path lies outside the Google Drive folder the
# setup cell switches into ('/content/drive/My Drive/') — confirm the file
# really exists here in the runtime, or point file_path at the Drive copy.
file_path = '/mnt/data/NYC_Census_Tracts_AddedCols.csv'
# Load the CSV into a DataFrame. This rebinds `df`, replacing the small
# example frame built in the previous section.
df = pd.read_csv(file_path)

# Preview the first rows to check that the file loaded as expected.
df.head()

### Describing the DataFrame

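The `head()` and `tail()` methods preview the first and last rows of a DataFrame (five rows by default). The `info()` method provides a concise summary of the DataFrame, including the number of non-null entries, column names, data types, and memory usage.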

In [None]:
# Preview the first five rows of the DataFrame
df.head(n=5)

In [None]:
# Preview the final five rows of the DataFrame
df.tail(n=5)

In [None]:
# Print a concise summary of the frame: column names, non-null counts,
# data types and memory usage.
df.info()

### Viewing Values of One Column

You can access a single column of a DataFrame using the column name. For example, to view the values of the `Population_2000` column:

In [None]:
# Select the single column 'Population_2000' by name and keep it under its own
# variable; the statistics cells that follow operate on this variable.
population_2000 = df['Population_2000']
# Bare last expression so the notebook displays the column's values.
population_2000

### Statistical Summary of One Column

The `describe()` method provides a statistical summary of a DataFrame column, including count, mean, standard deviation, minimum, maximum, and quartile values.

In [None]:
# Statistical summary of the selected column: count, mean, standard deviation,
# minimum, quartiles and maximum.
population_2000.describe()

### Basic Statistical Functions

Pandas provides several built-in functions for computing basic statistics on DataFrame columns, such as sum, mean, median, standard deviation, minimum, and maximum.

In [None]:
# Step 1: compute each summary statistic of the Population_2000 column once
# and keep it under its own name.
population_sum = population_2000.sum()
population_mean = population_2000.mean()
population_median = population_2000.median()
population_std = population_2000.std()
population_min = population_2000.min()
population_max = population_2000.max()

# Step 2: report the statistics, one per line, in the same order as above.
print(f"Sum: {population_sum}")
print(f"Mean: {population_mean}")
print(f"Median: {population_median}")
print(f"Standard Deviation: {population_std}")
print(f"Minimum: {population_min}")
print(f"Maximum: {population_max}")

### Pandas Study Guide: Subsetting and Adding Columns

### Getting a Subset of Columns

You can create a new DataFrame by selecting a subset of columns from an existing DataFrame. This is useful when you only need to work with certain columns of your data.

In [None]:
# Names of the columns to keep, in the order they should appear.
race_columns_2000 = ['GISJOIN', 'Population_2000', 'NH_White_2000', 'NH_Black_2000']

# Indexing with a list of names returns a new DataFrame holding only those columns.
subset_df = df[race_columns_2000]

# Show the first rows of the narrower frame.
subset_df.head()

### Adding a New Column

You can add a new column to a DataFrame by simply assigning values to it. Often, the values of the new column are derived from existing columns.

### Example: Calculating a New Column

Suppose we want to calculate the percentage of the Non-Hispanic White population in 2000 relative to the total population in 2000. We can do this by dividing the `NH_White_2000` column by the `Population_2000` column and multiplying by 100.

In [None]:
# Ratio of the Non-Hispanic White count to the total population, row by row.
# NOTE(review): rows with Population_2000 == 0 would presumably yield NaN/inf
# here rather than an error — confirm that is acceptable for this dataset.
nh_white_share_2000 = df['NH_White_2000'] / df['Population_2000']

# Scale the ratio to a percentage and store it as a new column on `df`.
df['Pct_NH_White_2000'] = nh_white_share_2000 * 100

# Display the inputs next to the derived column to sanity-check the result.
df[['GISJOIN', 'Population_2000', 'NH_White_2000', 'Pct_NH_White_2000']]

In [None]:
# Columns for the percentage view: identifier, the two source counts, and the
# derived percentage column.
pct_white_columns = ['GISJOIN', 'Population_2000', 'NH_White_2000', 'Pct_NH_White_2000']

# Keep only those columns in a separate DataFrame.
df_subset = df[pct_white_columns]

# Show the first rows of the result.
df_subset.head()

## Introduction to loc
The `loc` indexer is used to access a group of rows and columns by labels or a boolean array. It is primarily used for label-based indexing.



In [7]:
import pandas as pd

# Sample data for the `loc` examples: one list per column, one entry per row.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[24, 27, 22, 32, 29],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    Score=[85, 90, 88, 92, 79],
)

# Build the five-row, four-column sample frame used by the filtering cells below.
df1 = pd.DataFrame(data)

# Bare last expression so the notebook renders the frame.
df1


Unnamed: 0,Name,Age,City,Score
0,Alice,24,New York,85
1,Bob,27,Los Angeles,90
2,Charlie,22,Chicago,88
3,David,32,Houston,92
4,Eva,29,Phoenix,79


### Selecting Rows Based on a Single Condition
You can use `loc` to select rows based on a single condition. For example, to select rows where `Age` is greater than 25:

In [6]:
# Selecting rows where Age is greater than 25
subset_df = df1.loc[df1['Age'] > 25]
(subset_df)


Unnamed: 0,Name,Age,City,Score
1,Bob,27,Los Angeles,90
3,David,32,Houston,92
4,Eva,29,Phoenix,79


## Combining Conditions with and (&) and or (|)
You can combine multiple conditions using the `&` (and) and `|` (or) operators. When combining conditions, make sure to enclose each condition in parentheses, because `&` and `|` bind more tightly than comparison operators such as `>`.
### Example: Combining Conditions with and
To select rows where the Age is greater than 25 and the Score is greater than 80:

In [9]:
# Selecting rows where Age is greater than 25 and Score is greater than 80
subset_df_and = df1.loc[(df1['Age'] > 25) & (df1['Score'] > 80)]
subset_df_and


Unnamed: 0,Name,Age,City,Score
1,Bob,27,Los Angeles,90
3,David,32,Houston,92


### Example: Combining Conditions with or
To select rows where the Age is greater than 25 or the Score is greater than 80:

In [11]:
# Selecting rows where Age is greater than 25 or Score is greater than 80
subset_df_or = df1.loc[(df1['Age'] > 25) | (df1['Score'] > 80)]
subset_df_or


Unnamed: 0,Name,Age,City,Score
0,Alice,24,New York,85
1,Bob,27,Los Angeles,90
2,Charlie,22,Chicago,88
3,David,32,Houston,92
4,Eva,29,Phoenix,79
