# Generate Randomized Data Set v1

In [1]:
import numpy as np
import pandas as pd

def generate_random_df(num_rows=10000):
    # 1) Integer column
    int_col = np.random.randint(0, 1000, size=num_rows)
    
    # 2) Float column
    float_col = np.random.rand(num_rows)
    
    # 3) String (object) column
    #    We'll pick random strings of length 5 composed of uppercase letters
    random_strings = [
        ''.join(np.random.choice(list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), size=5))
        for _ in range(num_rows)
    ]
    
    # 4) Boolean column
    bool_col = np.random.choice([True, False], size=num_rows)
    
    # 5) Datetime column
    #    Generate a date range, then pick random samples from it
    date_range = pd.date_range('2000-01-01', periods=2*num_rows, freq='D')
    datetime_col = np.random.choice(date_range, size=num_rows)
    
    # 6) Timedelta column
    #    Create random integer days, then convert to Timedelta
    rand_days = np.random.randint(0, 1000, size=num_rows)
    timedelta_col = pd.to_timedelta(rand_days, unit='D')
    
    # 7) Categorical column
    categories = ['Red', 'Green', 'Blue', 'Yellow', 'Black']
    cat_col = np.random.choice(categories, size=num_rows)
    cat_col = pd.Categorical(cat_col, categories=categories)
    
    # 8) Complex column
    #    Combine random floats for real and imaginary parts
    complex_col = np.random.rand(num_rows) + 1j * np.random.rand(num_rows)
    
    # Assemble into a dictionary
    data = {
        'int_col': int_col,
        'float_col': float_col,
        'str_col': random_strings,
        'bool_col': bool_col,
        'datetime_col': datetime_col,
        'timedelta_col': timedelta_col,
        'cat_col': cat_col,
        'complex_col': complex_col
    }
    
    # Create DataFrame
    df = pd.DataFrame(data)
    return df

# Example usage
df_random = generate_random_df(10000)
print(df_random.info())
print(df_random.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype          
---  ------         --------------  -----          
 0   int_col        10000 non-null  int32          
 1   float_col      10000 non-null  float64        
 2   str_col        10000 non-null  object         
 3   bool_col       10000 non-null  bool           
 4   datetime_col   10000 non-null  datetime64[ns] 
 5   timedelta_col  10000 non-null  timedelta64[ns]
 6   cat_col        10000 non-null  category       
 7   complex_col    10000 non-null  complex128     
dtypes: bool(1), category(1), complex128(1), datetime64[ns](1), float64(1), int32(1), object(1), timedelta64[ns](1)
memory usage: 527.7+ KB
None
   int_col  float_col str_col  bool_col datetime_col timedelta_col cat_col  \
0      123   0.877557   DZXKE     False   2006-05-29      849 days  Yellow   
1      896   0.853197   HHRKF      True   2011-12-11      747 days   Gre

In [2]:
df_random

Unnamed: 0,int_col,float_col,str_col,bool_col,datetime_col,timedelta_col,cat_col,complex_col
0,123,0.877557,DZXKE,False,2006-05-29,849 days,Yellow,0.930888+0.949241j
1,896,0.853197,HHRKF,True,2011-12-11,747 days,Green,0.154265+0.759217j
2,154,0.829659,BGRER,False,2002-09-04,15 days,Blue,0.077769+0.241896j
3,342,0.871161,DSIKD,True,2034-11-26,992 days,Blue,0.777767+0.434159j
4,14,0.778305,HPJED,False,2009-02-06,976 days,Red,0.059707+0.024499j
...,...,...,...,...,...,...,...,...
9995,817,0.226143,SMOCX,True,2033-11-04,236 days,Green,0.735022+0.195327j
9996,204,0.965456,QNDRJ,False,2018-01-15,723 days,Blue,0.828005+0.887696j
9997,662,0.637535,SUFRD,False,2024-05-06,239 days,Yellow,0.042238+0.896905j
9998,280,0.754311,ZECGS,False,2051-12-14,545 days,Black,0.355024+0.686082j


In [4]:
df_random['cat_col'].value_counts()

cat_col
Green     2051
Red       2023
Blue      2014
Black     2002
Yellow    1910
Name: count, dtype: int64

In [5]:
df = df_random.copy()

max_datetime_indices = df.groupby('cat_col')['datetime_col'].idxmax()
df_max_datetime_per_cat = df.loc[max_datetime_indices]

# min_datetime_indices = df.groupby('cat_col')['datetime_col'].idxmin()
# df_min_datetime_per_cat = df.loc[min_datetime_indices]

display(df_max_datetime_per_cat)

  max_datetime_indices = df.groupby('cat_col')['datetime_col'].idxmax()


Unnamed: 0,int_col,float_col,str_col,bool_col,datetime_col,timedelta_col,cat_col,complex_col
403,495,0.487373,UTJUG,True,2054-09-29,499 days,Red,0.415588+0.466032j
9808,931,0.725176,EAEDD,False,2054-10-03,388 days,Green,0.888295+0.116972j
8387,606,0.040744,UJDKT,True,2054-09-03,380 days,Blue,0.291439+0.576668j
2075,899,0.582144,YVNVB,True,2054-09-25,945 days,Yellow,0.886086+0.750267j
7588,101,0.629371,TPYGL,True,2054-09-20,636 days,Black,0.124976+0.751283j


In [6]:
df_top3_datetime_per_cat = (
    df
    .groupby('cat_col', group_keys=True)
    .apply(lambda g: g.nlargest(3, 'datetime_col'))
)

display(df_top3_datetime_per_cat)

# df_top3_smallest_per_cat = (
#     df
#     .groupby('cat_col', group_keys=True)
#     .apply(lambda g: g.nsmallest(3, 'datetime_col'))
# )

# display(df_top3_smallest_per_cat)

  .groupby('cat_col', group_keys=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,int_col,float_col,str_col,bool_col,datetime_col,timedelta_col,cat_col,complex_col
cat_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Red,403,495,0.487373,UTJUG,True,2054-09-29,499 days,Red,0.415588+0.466032j
Red,7345,484,0.624321,BKADV,True,2054-09-27,224 days,Red,0.205280+0.064991j
Red,425,44,0.068021,QDKGI,True,2054-09-26,261 days,Red,0.103562+0.831743j
Green,9808,931,0.725176,EAEDD,False,2054-10-03,388 days,Green,0.888295+0.116972j
Green,5529,894,0.285167,BQYVS,False,2054-09-20,133 days,Green,0.924488+0.390266j
Green,4359,555,0.848221,EOIEV,False,2054-08-26,62 days,Green,0.591212+0.466222j
Blue,8387,606,0.040744,UJDKT,True,2054-09-03,380 days,Blue,0.291439+0.576668j
Blue,6139,824,0.877712,EYZLL,False,2054-08-29,966 days,Blue,0.058777+0.071702j
Blue,1817,490,0.914366,JPJAZ,False,2054-08-25,128 days,Blue,0.169772+0.040728j
Yellow,2075,899,0.582144,YVNVB,True,2054-09-25,945 days,Yellow,0.886086+0.750267j
