# Pandas : Dataset

## Create DataFrames from data structures

In [None]:
import numpy as np
import pandas as pd

data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

dt_from_array = pd.DataFrame(data,columns=['A','B','C'])
print(dt_from_array)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


In [None]:
data2 = [[1,'Jhon',22],[2,'Marry',33],[3,'Peter',44]]

df_from_list = pd.DataFrame(data2,columns=['ID','Name','Age'])
print(df_from_list)

   ID   Name  Age
0   1   Jhon   22
1   2  Marry   33
2   3  Peter   44


In [None]:
data = [{'ID':1,'Name':'Jhon','Age':22},
        {'ID':2,'Name':'Marry','Age':33},
        {'ID':3,'Name':'Peter','Age':44}]

df_from_dict = pd.DataFrame(data)
print(df_from_dict)

   ID   Name  Age
0   1   Jhon   22
1   2  Marry   33
2   3  Peter   44


In [None]:
data = {'ID':pd.Series([1,2,3]),
        'Name':pd.Series(['Jhon','Marry','Peter']),
        'Age':pd.Series([22,33,44])}


df_from_series= pd.DataFrame(data)
print(df_from_series)

   ID   Name  Age
0   1   Jhon   22
1   2  Marry   33
2   3  Peter   44


## Retail Data

In [1]:
import kagglehub
import shutil
import os

# Descargar la última versión del dataset
path = kagglehub.dataset_download("tunguz/online-retail")

print("Dataset descargado en:", path)

# Obtener la carpeta actual
current_dir = os.getcwd()

# Crear una carpeta destino dentro del directorio actual
dest_dir = os.path.join(current_dir, "online-retail")
os.makedirs(dest_dir, exist_ok=True)

# Copiar los archivos descargados a la carpeta actual
for file_name in os.listdir(path):
    full_file_name = os.path.join(path, file_name)
    if os.path.isfile(full_file_name):
        shutil.copy(full_file_name, dest_dir)

print(f"Archivos copiados a: {dest_dir}")


Dataset descargado en: /home/pinpa/.cache/kagglehub/datasets/tunguz/online-retail/versions/1
Archivos copiados a: /home/pinpa/Documents/programacion/platzi_course/data_manipulate_visualization/numpy_pandas/online-retail


In [2]:
import pandas as pd 

directory = "/home/pinpa/Documents/programacion/platzi_course/data_manipulate_visualization/numpy_pandas/online-retail/Online_Retail.csv"

df = pd.read_csv(directory, encoding='latin1')
print(df.head())

print("Type Dataframe:")
print(type(df))

  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

    InvoiceDate  UnitPrice  CustomerID         Country  
0  12/1/10 8:26       2.55     17850.0  United Kingdom  
1  12/1/10 8:26       3.39     17850.0  United Kingdom  
2  12/1/10 8:26       2.75     17850.0  United Kingdom  
3  12/1/10 8:26       3.39     17850.0  United Kingdom  
4  12/1/10 8:26       3.39     17850.0  United Kingdom  
Type Dataframe:
<class 'pandas.core.frame.DataFrame'>


In [3]:
#Nombres de columnas
columnas_names= df.columns
print(columnas_names)

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')


In [4]:
num_rows, num_columns= df.shape
print(f"Numero de filas: {num_rows}")
print(f"Numero de columnas: {num_columns}")

Numero de filas: 541909
Numero de columnas: 8


In [5]:
daily_sales = df['Quantity']
print(daily_sales)

0          6
1          6
2          8
3          6
4          6
          ..
541904    12
541905     6
541906     4
541907     4
541908     3
Name: Quantity, Length: 541909, dtype: int64


In [6]:
summary = df.describe()
print(summary)

            Quantity      UnitPrice     CustomerID
count  541909.000000  541909.000000  406829.000000
mean        9.552250       4.611114   15287.690570
std       218.081158      96.759853    1713.600303
min    -80995.000000  -11062.060000   12346.000000
25%         1.000000       1.250000   13953.000000
50%         3.000000       2.080000   15152.000000
75%        10.000000       4.130000   16791.000000
max     80995.000000   38970.000000   18287.000000


In [7]:
mean_value=daily_sales.mean()
median_value=daily_sales.median()
std_deviation=daily_sales.std()
sum_value=daily_sales.sum()
print(f"Mean: {mean_value}")
print(f"Median: {median_value}")
print(f"Standard Deviation: {std_deviation}")
print(f"Sum: {sum_value}")

Mean: 9.55224954743324
Median: 3.0
Standard Deviation: 218.08115784986612
Sum: 5176450


In [8]:
count_values = daily_sales.count()
print(f"Count: {count_values}")

Count: 541909


In [11]:
df.tail(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
541899,581587,22726,ALARM CLOCK BAKELIKE GREEN,4,12/9/11 12:50,3.75,12680.0,France
541900,581587,22730,ALARM CLOCK BAKELIKE IVORY,4,12/9/11 12:50,3.75,12680.0,France
541901,581587,22367,CHILDRENS APRON SPACEBOY DESIGN,8,12/9/11 12:50,1.95,12680.0,France
541902,581587,22629,SPACEBOY LUNCH BOX,12,12/9/11 12:50,1.95,12680.0,France
541903,581587,23256,CHILDRENS CUTLERY SPACEBOY,4,12/9/11 12:50,4.15,12680.0,France
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/11 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/11 12:50,2.1,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/11 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/11 12:50,4.15,12680.0,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,12/9/11 12:50,4.95,12680.0,France


### iloc loc

In [None]:
firts_row = df.iloc[:]# Access by position
print(firts_row)

InvoiceNo                            536365
StockCode                             22752
Description    SET 7 BABUSHKA NESTING BOXES
Quantity                                  2
InvoiceDate                    12/1/10 8:26
UnitPrice                              7.65
CustomerID                          17850.0
Country                      United Kingdom
Name: 5, dtype: object
