# **Key Terms**

> **DataFrame -** A 2 dimensional labeled data structure with columns of potentially different types. Like a spreadsheep or SQL table

In [2]:
import pandas as pd

# Create a DataFrame from dictionaries
data = {'names': ['Pirate', 'Bum','Pothead'],
        'ages': [400, 44, 45]}
df = pd.DataFrame(data)

print(df)

     names  ages
0   Pirate   400
1      Bum    44
2  Pothead    45


## **Column -**
> A vertical set of values in a DataFrame.Each column has a name and contains values of same data type.

In [4]:
# Access the 'name' column
print(df['names'])

0     Pirate
1        Bum
2    Pothead
Name: names, dtype: object


## **Row -**
> A horizontal entry in a DataFrame. Each row contains an observation with values for each column.

In [5]:
# Access the first row
print(df.iloc[2])

names    Pothead
ages          45
Name: 2, dtype: object


## **Iloc -**

> interger-location based indexer to select DataFrame rows and columns by index.


In [6]:
# Select rows 0 and 1
print(df.iloc[[0,1]])

    names  ages
0  Pirate   400
1     Bum    44


## **loc -**

> Label-location based indexer to select DataFrame rows and columns by columns name.

In [8]:
# Select rows by condition
print(df.loc[df['ages'] > 44])

     names  ages
0   Pirate   400
2  Pothead    45


### **Comparison operators -**
> Operatores like == , !=, <,>etc. used to compare values and return Boolean true/false results. Important for filtering data.

### **Boolean operators -**
> Operators like & , |, ~ used to combine comparison expressions and return true/false results. Important for comples filter logic.

### **Filters -**
> Boolean indexed arrays that allow selecting subsets of data meeting comparison criteria. Created using comparison/Boolean operators.

### **loc accessor -**
> Accessor used to select/filter data from a DataFrame by label or Boolean array.

### **isin() method -**
> Method to filter data using a list of values to check for set membership. Useful alternative to repeated equality checks.

In [2]:
import pandas as pd

In [3]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})


   A  B
1  2  5


In [4]:
# Comparison operators 
print(df[df['A'] == 2]) # Simple filter

   A  B
1  2  5


In [5]:
# Boolean operators
filter = (df['A'] > 1) & (df['B'] == 6)  
print(df[filter]) # Complex filter

   A  B
2  3  6


In [6]:
# Filters with loc accessor
filter = df['A'] > 1
print(df.loc[filter]) 

   A  B
1  2  5
2  3  6


In [7]:

# isin() method
values = [2, 5]
filter = df['A'].isin(values) 
print(df[filter])

   A  B
1  2  5


### **numpy.ndarray**
> *  n-dimensional Numpy array, the core object that under pins Pandas DataFrames

### **pyspark.sql.DataFrame**
> * Distributed dataframe object in PySpark for working with large datasets

### **dask.dataframe.DataFrame**
> * Dask dataframe that partitions Pandas dataframe across multiple cores

### **np.random.randn()** 
> * Generate an ndarray with random values from a normal distribution

### **df.persist()**
> * Trigger caching of a PySpark dataframe into memory or disk storage



In [2]:
# Pandas Code Examples
import pandas as pd
import numpy as np

In [3]:

# numpy.ndarray
data = np.random.randn(5, 4)  # Generate 5x4 ndarray 
df = pd.DataFrame(data) # Underlying structure of Pandas DataFrame
print(df)

          0         1         2         3
0  0.632713 -0.480907  0.450839  0.711421
1 -0.816838 -1.013212  0.017223  1.213634
2 -0.058457  1.311762  0.692476 -1.105042
3  0.638599  1.996523  0.613670  0.171192
4 -0.446314  0.983192  0.209889 -0.336484


In [5]:
# np.random.randn()  
df = pd.DataFrame(np.random.randn(5, 4), 
                  columns=['A_column', 'B_column', 'C_ya', 'DeezNuts']) 
print(df)

   A_column  B_column      C_ya  DeezNuts
0  0.978081 -0.751951  0.149860 -1.361199
1  0.994186  2.500617  2.307605  1.032525
2 -0.986731  1.435462  0.777692 -0.106692
3 -0.297237 -0.537374  0.327383  0.243513
4 -0.460401 -0.037491  0.161893 -0.646683


In [2]:
'''
 POLARS Key Points:

Polars supports reading/writing many data formats like CSV and Parquet

Expressions allow modular DataFrame transformations

Filter, select, group_by, join, and concat combine to form complex workflows
'''

import polars as pl

# Create DataFrame 
data = [{"fruit": "apple", "count": 10, "price": 0.50}, 
        {"fruit": "banana", "count": 20, "price": 0.25}]
df = pl.from_dicts(data)

# Expressions to select, filter, aggregate
sel = df.select(["fruit", "count"]) # Select columns
filt = sel.filter(pl.col("fruit") == "apple") # Filter rows
agg = filt.group_by("fruit").agg(pl.col("count").sum()) # Aggregate

print(agg)

shape: (1, 2)
┌───────┬───────┐
│ fruit ┆ count │
│ ---   ┆ ---   │
│ str   ┆ i64   │
╞═══════╪═══════╡
│ apple ┆ 10    │
└───────┴───────┘


In [4]:
# code hint using Polars
# Join DataFrames
joined = df.join(df2, on=["fruit_name"]) 

# Visualize
joined.plot.scatter(x="count", y="price")

NameError: name 'df2' is not defined