In [None]:
import pandas as pd
import numpy as np
import random

%run datavis.ipynb

In [None]:
# remove_nan_rows (edge)
data = {"Column 1": [np.nan, np.nan],
        "Column 2": [2, np.nan]}

df = pd.DataFrame(data)

print(remove_nan_rows(df))

   Column 1  Column 2
0       NaN       2.0
1       NaN       NaN
Empty DataFrame
Columns: [Column 1, Column 2]
Index: []


In [None]:
# remove_nan_rows (general)
data = {"Column 1": [2, 3, 5],
        "Column 2": [2, np.nan, "b"],
        "Column 3": [9, "a", 1]}

df = pd.DataFrame(data)

print(remove_nan_rows(df))

   Column 1 Column 2 Column 3
0         2        2        9
2         5        b        1


In [None]:
# remove_nan_rows (random)
data = {}
for i in range(random.randint(1, 5)):
    values = []
    for j in range(3):
        if random.randint(1, 10) == 4:
            values.append(np.nan)
        else:
            values.append(j)
    data["Column " + str(i)] = values

df = pd.DataFrame(data)

print(remove_nan_rows(df))

{'Column 0': [nan, nan, 2], 'Column 1': [0, 1, nan]}
Empty DataFrame
Columns: [Column 0, Column 1]
Index: []


In [17]:
# replace_nan_with_mean (edge)
data = {"Column 1": [np.nan],
        "Column 2": [np.nan]}

df = pd.DataFrame(data)

print(replace_nan_with_mean(df))

   Column 1  Column 2
0       NaN       NaN


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[col].fillna(df_copy[col].mean(), inplace=True)


In [20]:
# replace_nan_with_mean (general)
data = {"Column 1": [2, 3, 5],
        "Column 2": [2, np.nan, 7],
        "Column 3": [9, 2, 1]}

df = pd.DataFrame(data)

print(replace_nan_with_mean(df))

   Column 1  Column 2  Column 3
0         2       2.0         9
1         3       4.5         2
2         5       7.0         1


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[col].fillna(df_copy[col].mean(), inplace=True)


In [25]:
# replace_nan_with_mean (random)
data = {}
for i in range(random.randint(1, 5)):
    values = []
    for j in range(3):
        if random.randint(1, 10) == 4:
            values.append(np.nan)
        else:
            values.append(j)
    data["Column " + str(i)] = values

df = pd.DataFrame(data)

print(replace_nan_with_mean(df))

   Column 0  Column 1  Column 2  Column 3  Column 4
0         0         0       0.0       1.5         0
1         1         1       1.0       1.0         1
2         2         2       0.5       2.0         2


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy[col].fillna(df_copy[col].mean(), inplace=True)


In [None]:
# one_hot_encode (edge)
data = {"Column 1": ["1", "1", "1"]}

df = pd.DataFrame(data)

print(one_hot_encode(df["Column 1"], ["0", "1"]))

   Column 1_0  Column 1_1
0           0        True
1           0        True
2           0        True


In [34]:
# one_hot_encode (general)
data = {"Column 1": ["1", "0", "1", "0", "1", "1", "0"]}

df = pd.DataFrame(data)

print(one_hot_encode(df["Column 1"], ["0", "1"]))

   Column 1_0  Column 1_1
0       False        True
1        True       False
2       False        True
3        True       False
4       False        True
5       False        True
6        True       False


In [36]:
# one_hot_encode (random)
values = []
for i in range(random.randint(1, 15)):
    values.append(random.randint(0, 1))
data = {"Column 1": values}

df = pd.DataFrame(data)

print(one_hot_encode(df["Column 1"], ["0", "1"]))

   Column 1_0  Column 1_1
0        True       False
1        True       False
2       False        True
3       False        True
4       False        True
5       False        True
6       False        True


In [None]:
# summarize (edge)
data = {"Column 1": []}

df = pd.DataFrame(data)

print(summarize(df))


Summary for Column 1:
Stats: mean   NaN,   std    NaN,   min    NaN,   50%    NaN,   max    NaN
None


In [45]:
# summarize (general)
data = {"Column 1": [1, 5, 112],
        "Column 2": ["A", "R", "ASUIH"],
        "Column 3": [34, -1, 2]}

df = pd.DataFrame(data)

print(summarize(df))


Summary for Column 2:
Unique values: 3
Column 2
A        1
R        1
ASUIH    1
Name: count, dtype: int64

Summary for Column 1:
Stats: mean     39.333333,   std      62.962952,   min       1.000000,   50%       5.000000,   max     112.000000

Summary for Column 3:
Stats: mean    11.666667,   std     19.399313,   min     -1.000000,   50%      2.000000,   max     34.000000
None


In [47]:
# summarize (random)
import string

data = {}
elements = random.randint(1, 5)
for i in range(random.randint(1, 5)):
    values = []
    if random.randint(0, 1) == 0:
        for j in range(elements):
            values.append(random.randint(1, 100))
    else:
        for j in range(elements):
            values.append(''.join(random.choices(string.ascii_letters + string.digits, k=random.randint(5, 25))))
    data["Column " + str(i)] = values

df = pd.DataFrame(data)

print(summarize(df))


Summary for Column 1:
Unique values: 4
Column 1
e8sAvNMoX0         1
FicZO87Z55eYPBa    1
hD9Em0             1
ZsRPQJI7H30n       1
Name: count, dtype: int64

Summary for Column 2:
Unique values: 4
Column 2
kAj6NMKMHG                   1
ghYDJNQVVrLCa2Kj75wasEg9X    1
6Ay3KRZFbEc1EbAQKdmq6N6Y     1
9unZOql0GLEA9KeHF2Mnt7o      1
Name: count, dtype: int64

Summary for Column 0:
Stats: mean    47.75000,   std     30.81531,   min     22.00000,   50%     39.50000,   max     90.00000
None
