In [None]:
import pandas as pd
import numpy as np
# Small demo frame: a low-cardinality label column plus a numeric column.
df = pd.DataFrame(
    {
        'category': ['A', 'B', 'A', 'C'],
        'value': [10, 20, 15, 30],
    }
)
print(df)
print("\n")
# Repeated string labels are stored more compactly as a categorical dtype.
df['category'] = pd.Categorical(df['category'])
print(df.dtypes)

  category  value
0        A     10
1        B     20
2        A     15
3        C     30


category    category
value          int64
dtype: object


In [None]:
# Vectorized operations vs. apply()
# Arithmetic on a whole column is preferred over apply() or explicit loops for performance.
df['new_value'] = df['value'].mul(2)  # vectorized: one operation over the entire column

print(df)

# Row-wise apply() calls a Python function once per row; keep it for logic that
# cannot be expressed as a column operation.
df['new_value'] = df.apply(lambda record: record['value'] * 2, axis=1)  # slower, same result
print(df)

  category  value  new_value
0        A     10         20
1        B     20         40
2        A     15         30
3        C     30         60
  category  value  new_value
0        A     10         20
1        B     20         40
2        A     15         30
3        C     30         60


In [None]:
# eval() and query() evaluate string expressions against the frame's columns.
# NOTE(review): pandas documents their speed advantage for large frames; on a
# tiny demo frame like this one the parsing overhead likely dominates - confirm
# with %timeit before claiming a speed-up.
# Rebinding the result (instead of inplace=True) keeps the cell chain-friendly.
df = df.eval('new_value = value * 2')
print(df)

# Keep only the rows whose value exceeds 15 and show the *filtered* frame.
# (The previous version printed `df` again, so the filter result was never displayed.)
df_filtered = df.query('value > 15')
print(df_filtered)

  category  value  new_value
0        A     10         20
1        B     20         40
2        A     15         30
3        C     30         60
  category  value  new_value
0        A     10         20
1        B     20         40
2        A     15         30
3        C     30         60


In [None]:
import pandas as pd

# One list per index level: outer group labels first, inner numbers second.
arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]

# Combine the level arrays into a two-level index and name each level.
index = pd.MultiIndex.from_arrays(
    arrays,
    names=('Group', 'Num'),
)

# Attach the hierarchical index to a single-column frame.
df = pd.DataFrame(
    data={'Values': [10, 20, 30, 40]},
    index=index,
)

# Show the frame; repeated outer labels are displayed once per group.
print(df)


           Values
Group Num        
A     1        10
      2        20
B     1        30
      2        40


In [None]:
import pandas as pd

# Long-format input: one row per (Category, Type) pair with its Value.
df = pd.DataFrame(
    dict(
        Category=['A', 'A', 'B', 'B'],
        Type=['X', 'Y', 'X', 'Y'],
        Value=[10, 20, 15, 25],
    )
)

# Reshape to wide format: Category down the rows, Type across the columns,
# summing Value for each combination.
pivot_df = pd.pivot_table(
    df,
    values='Value',
    index='Category',
    columns='Type',
    aggfunc='sum',
)

# Show the reshaped table.
print(pivot_df)


Type       X   Y
Category        
A         10  20
B         15  25


In [None]:
import pandas as pd

# Sample frame; only the numeric Value column feeds the window calculations.
df = pd.DataFrame(
    {
        'Category': ['A', 'A', 'B', 'B'],
        'Type': ['X', 'Y', 'X', 'Y'],
        'Value': [10, 20, 15, 25],
    }
)

# Mean over a sliding window of 2 rows; the first row has no full window, so it is NaN.
df['rolling_avg'] = df['Value'].rolling(2).mean()

# Show the frame with the rolling average added.
print("Rolling Average:", df, sep="\n")

# Running total: each row sums every Value from the first row up to itself.
df['expanding_sum'] = df['Value'].expanding().sum()

# Show the frame again, now with the running total as well.
print("\nExpanding Sum:", df, sep="\n")


Rolling Average:
  Category Type  Value  rolling_avg
0        A    X     10          NaN
1        A    Y     20         15.0
2        B    X     15         17.5
3        B    Y     25         20.0

Expanding Sum:
  Category Type  Value  rolling_avg  expanding_sum
0        A    X     10          NaN           10.0
1        A    Y     20         15.0           30.0
2        B    X     15         17.5           45.0
3        B    Y     25         20.0           70.0


In [None]:
# Use chunksize when reading large files: read_csv then yields DataFrames of up
# to 100 rows each instead of loading the whole file into memory at once.
# The reader is opened as a context manager so the underlying file handle is
# closed even if the loop stops early (exception, interrupt, break).
with pd.read_csv('/content/car_price_dataset.csv', chunksize=100) as chunks:
    for chunk in chunks:
        # Preview up to the first 80 rows of every chunk.
        print(chunk.head(80))

         Brand   Model  Year  Engine_Size Fuel_Type    Transmission  Mileage  \
0          Kia     Rio  2020          4.2    Diesel          Manual   289944   
1    Chevrolet  Malibu  2012          2.0    Hybrid       Automatic     5356   
2     Mercedes     GLA  2020          4.2    Diesel       Automatic   231440   
3         Audi      Q5  2023          2.0  Electric          Manual   160971   
4   Volkswagen    Golf  2003          2.6    Hybrid  Semi-Automatic   286618   
..         ...     ...   ...          ...       ...             ...      ...   
75       Honda    CR-V  2022          3.4    Diesel       Automatic   248894   
76    Mercedes     GLA  2007          2.3  Electric          Manual    76458   
77         BMW      X5  2017          3.9    Hybrid  Semi-Automatic     2020   
78     Hyundai  Sonata  2018          2.0    Hybrid       Automatic   121567   
79  Volkswagen    Golf  2023          3.5    Petrol  Semi-Automatic   139477   

    Doors  Owner_Count  Price  
0      

In [None]:
import dask.dataframe as dd
# Open the same CSV through dask.dataframe and preview its first five rows.
ddf = dd.read_csv(
    '/content/car_price_dataset.csv'
)
print(ddf.head(5))

        Brand   Model  Year  Engine_Size Fuel_Type    Transmission  Mileage  \
0         Kia     Rio  2020          4.2    Diesel          Manual   289944   
1   Chevrolet  Malibu  2012          2.0    Hybrid       Automatic     5356   
2    Mercedes     GLA  2020          4.2    Diesel       Automatic   231440   
3        Audi      Q5  2023          2.0  Electric          Manual   160971   
4  Volkswagen    Golf  2003          2.6    Hybrid  Semi-Automatic   286618   

   Doors  Owner_Count  Price  
0      3            5   8501  
1      2            3  12092  
2      4            2  11171  
3      2            1  11780  
4      3            3   2867  


In [None]:
# Load one worksheet of the workbook into a DataFrame; sheet_name selects the tab
# and can be dropped when the first sheet is the one wanted.
# NOTE(review): the captured preview shows "Unnamed: N" column names and all-NaN
# leading rows, so the real header row presumably sits further down the sheet -
# inspect the workbook and consider header=/skiprows= once its position is known.
df = pd.read_excel(
    "/content/Historicalinvesttemp.xlsx",
    sheet_name="Sheet1",
)
print(df.head(5))

  Unnamed: 0                        Unnamed: 1 Unnamed: 2 Unnamed: 3
0        NaN                               NaN        NaN        NaN
1        NaN                               NaN        NaN        NaN
2        NaN                               NaN        NaN        NaN
3        NaN                               NaN        NaN        NaN
4        NaN  Annual Returns on Investments in        NaN        NaN


In [None]:
# Parse the JSON file into a DataFrame and preview the first five records.
df = pd.read_json(
    "iris.json"
)
print(df.head(5))

   sepalLength  sepalWidth  petalLength  petalWidth species
0          5.1         3.5          1.4         0.2  setosa
1          4.9         3.0          1.4         0.2  setosa
2          4.7         3.2          1.3         0.2  setosa
3          4.6         3.1          1.5         0.2  setosa
4          5.0         3.6          1.4         0.2  setosa


In [None]:
import sqlite3
# Connect to the SQLite database file.
# sqlite3.connect() creates a new, empty database when the file does not exist,
# which then surfaces as "no such table" on the first query.
# The connection is left open so later cells can reuse `conn`; call conn.close()
# once it is no longer needed.
conn = sqlite3.connect("/content/sakila.db")

# The previous query read from "my_table", which this database does not contain
# (DatabaseError: no such table: my_table). Look up the real table names first.
table_names = [
    name
    for name in pd.read_sql_query(
        "SELECT name FROM sqlite_master WHERE type = 'table' ORDER BY name",
        conn,
    )["name"]
    if not name.startswith("sqlite_")  # skip SQLite's internal bookkeeping tables
]
if not table_names:
    raise ValueError(
        "No tables found in /content/sakila.db - check that the database file "
        "exists at this path and is not empty"
    )
print("Available tables:", table_names)

# Run a SQL query against the first available table.
table_name = table_names[0]
# Quote the identifier (doubling embedded quotes) so any table name is valid SQL.
quoted_table = '"' + table_name.replace('"', '""') + '"'
df = pd.read_sql_query(f"SELECT * FROM {quoted_table}", conn)
print(df.head())

DatabaseError: Execution failed on sql 'SELECT * FROM my_table': no such table: my_table