In [2]:
import pandas as pd

import numpy as np
 
# Create a NumPy array

aidata_numpy = np.array([

    ["Blue", [1, 2], 1.1],

    ["Red", [3, 4], 2.2],

    ["Pink", [5, 6], 3.3],

    ["Grey", [7, 8], 4.4],

    ["Black", [9, 10], 5.5],

    ["Violet", [12, 10], 6.6 ]

], dtype=object)  # Specify dtype as object
 
# Create a DataFrame from the NumPy array

df_from_numpy = pd.DataFrame(aidata_numpy, columns=["color", "list", "number"])
 
# Print DataFrame and types

print("DataFrame created from NumPy array:\n", df_from_numpy)

print("\nTypes for each column:")

print(df_from_numpy.dtypes)

print("\nTypes of the first value of every column:")

print(df_from_numpy.iloc[0].apply(type))


DataFrame created from NumPy array:
     color      list number
0    Blue    [1, 2]    1.1
1     Red    [3, 4]    2.2
2    Pink    [5, 6]    3.3
3    Grey    [7, 8]    4.4
4   Black   [9, 10]    5.5
5  Violet  [12, 10]    6.6

Types for each column:
color     object
list      object
number    object
dtype: object

Types of the first value of every column:
color       <class 'str'>
list       <class 'list'>
number    <class 'float'>
Name: 0, dtype: object


In [3]:
import pandas as pd
import ssl
from urllib.request import urlopen
 
# Disable SSL certificate verification
ssl._create_default_https_context = ssl._create_unverified_context
 
# URL of the dataset
url = "https://assets.01-edu.org/ai-branch/piscine-ai/household_power_consumption.txt"
 
# Download the dataset
response = urlopen(url)
data = response.read().decode("utf-8")
 
# Save the data to a temporary file
temp_file_path = "household_power_consumption.txt"
with open(temp_file_path, "w") as temp_file:
    temp_file.write(data)
 
# Load the dataset into a DataFrame
df = pd.read_csv(temp_file_path, sep=';', header=0, low_memory=False)
 
# Display the first few rows of the DataFrame
print("First few rows of the DataFrame:")
print(df.head())
 
# Assuming your data is loaded into a DataFrame called 'df'
# Step 1: Delete specified columns
columns_to_delete = ['Time', 'Sub_metering_2', 'Sub_metering_3']
df.drop(columns=columns_to_delete, inplace=True)
 
# Step 2: Set 'Date' as index and handle date parsing
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df.set_index('Date', inplace=True)
 
# Step 3: Create a function to update types
def update_types(df):
    # Convert 'Sub_metering_1' to numeric and update types
    df['Sub_metering_1'] = pd.to_numeric(df['Sub_metering_1'], errors='coerce')
    df['Sub_metering_1'] = (df['Sub_metering_1'].fillna(0) + 1) * 0.06
    return df
 
df = update_types(df)
 
# Step 4: Describe to get an overview of the dataset
print(df.describe())
 
# Step 5: Delete rows with missing values
df.dropna(inplace=True)
 
# Step 6: Modify 'Sub_metering_1'
df['Sub_metering_1'] = (df['Sub_metering_1'] + 1) * 0.06
 
# Step 7: Select rows based on criteria
# Convert 'Global_active_power' and 'Voltage' to numeric for comparison
df['Global_active_power'] = pd.to_numeric(df['Global_active_power'], errors='coerce')
df['Voltage'] = pd.to_numeric(df['Voltage'], errors='coerce')
 
selected_rows = df[(df.index >= '2008-12-27') & (df['Voltage'] >= 242)]
print("Selected rows based on the criteria:")
print(selected_rows)
 
# Step 8: Print the 88888th row
print(df.iloc[88887])
 
# Step 9: Date for which 'Global_active_power' is maximal
max_global_active_power_date = df['Global_active_power'].idxmax()
print('Date for max Global_active_power:', max_global_active_power_date)
 
# Step 10: Sort the first three columns
df.sort_values(by=['Global_active_power', 'Voltage'], ascending=[False, True], inplace=True)
 
# Step 11: Compute daily average of 'Global_active_power'
daily_avg_global_active_power = df['Global_active_power'].resample('D').mean()
 
# Print the daily average of 'Global_active_power'
print('Daily average of Global_active_power:')
print(daily_avg_global_active_power)

First few rows of the DataFrame:
         Date      Time Global_active_power Global_reactive_power  Voltage  \
0  16/12/2006  17:24:00               4.216                 0.418  234.840   
1  16/12/2006  17:25:00               5.360                 0.436  233.630   
2  16/12/2006  17:26:00               5.374                 0.498  233.290   
3  16/12/2006  17:27:00               5.388                 0.502  233.740   
4  16/12/2006  17:28:00               3.666                 0.528  235.680   

  Global_intensity Sub_metering_1 Sub_metering_2  Sub_metering_3  
0           18.400          0.000          1.000            17.0  
1           23.000          0.000          1.000            16.0  
2           23.000          0.000          2.000            17.0  
3           23.000          0.000          1.000            17.0  
4           15.800          0.000          1.000            17.0  
       Sub_metering_1
count    2.075259e+06
mean     1.264727e-01
std      3.669401e-01
min     