In [2]:
import pandas as pd
import numpy as np
from datetime import timedelta, datetime

In [64]:
# Sample bookings: one record per stay (guest, check-in/checkout dates, market, price).
booking_columns = ["guest_id", "ds_checkin", "ds_checkout", "dim_market", "price"]
booking_rows = [
    (1, "2023-06-16", "2023-06-20", "B", 100),
    (1, "2023-06-10", "2023-06-15", "A", 200),
    (1, "2023-06-21", "2023-06-25", "C", 400),
    (1, "2023-07-01", "2023-07-05", "D", 300),
    (2, "2023-07-02", "2023-07-07", "A", 120),
    (2, "2023-07-21", "2023-07-22", "BB", 100),
]
# Keep `data` as a list of per-stay dicts, keyed by column name.
data = [dict(zip(booking_columns, row)) for row in booking_rows]
df = pd.DataFrame(data)

In [65]:
# Display the sample bookings frame.
df

Unnamed: 0,guest_id,ds_checkin,ds_checkout,dim_market,price
0,1,2023-06-16,2023-06-20,B,100
1,1,2023-06-10,2023-06-15,A,200
2,1,2023-06-21,2023-06-25,C,400
3,1,2023-07-01,2023-07-05,D,300
4,2,2023-07-02,2023-07-07,A,120
5,2,2023-07-21,2023-07-22,BB,100


In [66]:
# Extract the values of a single column as a numpy array.
# Series.to_numpy() is the accessor pandas recommends over the older `.values`.
df['guest_id'].to_numpy()

array([1, 1, 1, 1, 2, 2])

## Simple transformation

In [52]:
# Bin the numeric price column into four quantile-based categories.
quartile_labels = ['q1', 'q2', 'q3', 'q4']
df['price_bin'] = pd.qcut(df['price'], q=len(quartile_labels), labels=quartile_labels)

In [53]:
# Display the frame to inspect the new price_bin column.
df

Unnamed: 0,guest_id,ds_checkin,ds_checkout,dim_market,price,price_bin
0,1,2023-06-16,2023-06-20,B,100,q1
1,1,2023-06-10,2023-06-15,A,200,q3
2,1,2023-06-21,2023-06-25,C,400,q4
3,1,2023-07-01,2023-07-05,D,300,q4
4,2,2023-07-02,2023-07-07,A,120,q2
5,2,2023-07-21,2023-07-22,BB,100,q1


## Datetime formatting

In [18]:
# Parse both date columns from "YYYY-MM-DD" strings with the vectorized
# pd.to_datetime. A row-wise datetime.strptime via .apply raises TypeError when
# the cell is re-run on already-parsed values and on missing values, whereas
# pd.to_datetime passes datetimes through and maps missing values to NaT.
for date_column in ('ds_checkin', 'ds_checkout'):
    df[date_column] = pd.to_datetime(df[date_column], format="%Y-%m-%d")

In [19]:
# Display the frame after the check-in/checkout columns were parsed.
df

Unnamed: 0,guest_id,ds_checkin,ds_checkout,dim_market
0,1,2023-06-16,2023-06-20,B
1,1,2023-06-10,2023-06-15,A
2,1,2023-06-21,2023-06-25,C
3,1,2023-07-01,2023-07-05,D
4,2,2023-07-02,2023-07-07,A
5,2,2023-07-21,2023-07-22,BB


## Sort dataframe

In [20]:
# Sort ascending by guest, then check-in date, then checkout date.
sort_columns = ['guest_id', 'ds_checkin', 'ds_checkout']
df = df.sort_values(by=sort_columns, ascending=True)

In [99]:
# Display the frame after sorting by guest and stay dates.
df

Unnamed: 0,guest_id,ds_checkin,ds_checkout,dim_market,price,ds_checkin_rank,ds_checkin_rank_1
1,1,2023-06-10,2023-06-15,A,200,1.0,2.0
0,1,2023-06-16,2023-06-20,B,100,2.0,3.0
2,1,2023-06-21,2023-06-25,C,400,3.0,4.0
3,1,2023-07-01,2023-07-05,D,300,4.0,5.0
4,2,2023-07-02,2023-07-07,A,120,1.0,2.0
5,2,2023-07-21,2023-07-22,BB,100,2.0,3.0


In [105]:
# Average price per guest.
df.groupby('guest_id')['price'].mean()

guest_id
1    250.0
2    110.0
Name: price, dtype: float64

## Grouping and transform

In [22]:
# Checkout date of the preceding row within the same guest; each guest's
# first row has no predecessor and gets a missing value.
previous_checkout = df.groupby('guest_id')['ds_checkout'].shift(periods=1)
df['ds_checkout_shift'] = previous_checkout

In [24]:
# Display the frame to inspect the new ds_checkout_shift column.
df

Unnamed: 0,guest_id,ds_checkin,ds_checkout,dim_market,ds_checkout_shift
1,1,2023-06-10,2023-06-15,A,NaT
0,1,2023-06-16,2023-06-20,B,2023-06-15
2,1,2023-06-21,2023-06-25,C,2023-06-20
3,1,2023-07-01,2023-07-05,D,2023-06-25
4,2,2023-07-02,2023-07-07,A,NaT
5,2,2023-07-21,2023-07-22,BB,2023-07-07


In [27]:
# Whole days between this check-in and the shifted (previous) checkout.
gap_since_previous_stay = df['ds_checkin'] - df['ds_checkout_shift']
df['time_delta'] = gap_since_previous_stay.dt.days

In [28]:
# Display the frame to inspect the new time_delta column.
df

Unnamed: 0,guest_id,ds_checkin,ds_checkout,dim_market,ds_checkout_shift,time_delta
1,1,2023-06-10,2023-06-15,A,NaT,
0,1,2023-06-16,2023-06-20,B,2023-06-15,1.0
2,1,2023-06-21,2023-06-25,C,2023-06-20,1.0
3,1,2023-07-01,2023-07-05,D,2023-06-25,6.0
4,2,2023-07-02,2023-07-07,A,NaT,
5,2,2023-07-21,2023-07-22,BB,2023-07-07,14.0


In [81]:
# Rank check-in dates within each guest; method='first' breaks ties by the
# order in which the rows appear.
checkin_by_guest = df.groupby('guest_id')['ds_checkin']
df['ds_checkin_rank'] = checkin_by_guest.rank(method='first', ascending=True)

In [97]:
# Collect each guest's markets into a single list per guest.
markets_by_guest = df.groupby('guest_id')['dim_market'].apply(list)
df_market = markets_by_guest.reset_index()
df_market

Unnamed: 0,guest_id,dim_market
0,1,"[A, B, C, D]"
1,2,"[A, BB]"


In [98]:
# Expand the list column back into one row per list element.
df_market.explode(column='dim_market')

Unnamed: 0,guest_id,dim_market
0,1,A
0,1,B
0,1,C
0,1,D
1,2,A
1,2,BB


## Assign values by condition

In [29]:
# Mark rows whose time_delta is missing or greater than 3 days, and copy their
# check-in date into trip_first_ds; all other rows are left unset.
starts_new_trip = df['time_delta'].isnull() | (df['time_delta'] > 3)
df.loc[starts_new_trip, 'trip_first_ds'] = df.loc[starts_new_trip, 'ds_checkin']


## Filling missing data

In [31]:
# Forward-fill: rows without a trip_first_ds inherit the value from the closest
# preceding row that has one (this depends on the current row order).
# Series.ffill() replaces fillna(method='ffill'), deprecated since pandas 2.1.
df['trip_first_ds'] = df['trip_first_ds'].ffill()

In [36]:
# Grouping and aggregation: first non-null market per (guest, trip start date).
trip_groups = df.groupby(['guest_id', 'trip_first_ds'])
first_market = trip_groups['dim_market'].first()
result1 = first_market.reset_index()
result1

Unnamed: 0,guest_id,trip_first_ds,dim_market
0,1,2023-06-10,A
1,1,2023-07-01,D
2,2,2023-07-02,A
3,2,2023-07-21,BB


In [38]:
# Number of non-null market values per (guest, trip start date).
market_counts = df.groupby(['guest_id', 'trip_first_ds'])['dim_market'].count()
result2 = market_counts.reset_index()
result2

Unnamed: 0,guest_id,trip_first_ds,dim_market
0,1,2023-06-10,3
1,1,2023-07-01,1
2,2,2023-07-02,1
3,2,2023-07-21,1


In [49]:
# All markets of each (guest, trip start date) group gathered into a list.
trip_groups = df.groupby(['guest_id', 'trip_first_ds'])
market_lists = trip_groups.agg({'dim_market': list})
result3 = market_lists.reset_index()
result3

Unnamed: 0,guest_id,trip_first_ds,dim_market
0,1,2023-06-10,"[A, B, C]"
1,1,2023-07-01,[D]
2,2,2023-07-02,[A]
3,2,2023-07-21,[BB]


In [43]:
# Distinct markets of each (guest, trip start date) group.
unique_markets = df.groupby(['guest_id', 'trip_first_ds'])['dim_market'].unique()
result4 = unique_markets.reset_index()
result4

Unnamed: 0,guest_id,trip_first_ds,dim_market
0,1,2023-06-10,"[A, B, C]"
1,1,2023-07-01,[D]
2,2,2023-07-02,[A]
3,2,2023-07-21,[BB]


In [54]:
def _position_in_own_list(row):
    """Return the position of row['item'] inside row['list_of_items'], or None if absent."""
    candidates = row['list_of_items']
    if row['item'] in candidates:
        return candidates.index(row['item'])
    return None


# Each row pairs an item with the list it should be looked up in.
data = {
    "item": ['a', 'b', 'c'],
    "list_of_items": [['a', 'b', 'c'], ['x', 'b', 'y'], ['c', 'd', 'e']],
}

# Build the DataFrame from the column-oriented dict.
df = pd.DataFrame(data)

# Row-wise lookup: where does each item sit in its own list?
df['index'] = df.apply(_position_in_own_list, axis=1)

print(df)

  item list_of_items  index
0    a     [a, b, c]      0
1    b     [x, b, y]      1
2    c     [c, d, e]      0


In [62]:
# Replace missing positions with '' and multiply every value by 2: numbers are
# doubled, while '' * 2 is still ''. `.values` returns the result as a numpy array.
# NOTE(review): the saved output `numpy.ndarray` looks like it came from a
# type(...) call around this expression — confirm and re-run the cell.
df['index'].fillna('').apply(lambda x: x*2).values

numpy.ndarray

In [57]:
# Dictionary mapping each item to the value in its 'index' column.
index_by_item = df.set_index('item')['index']
index_map = index_by_item.to_dict()

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample text data
documents = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

# Character-level analyzer restricted to n-grams of exactly 4 characters
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(4, 4))

# Fit and transform the text data to create character-level 4-gram TF-IDF sparse vectors
X = vectorizer.fit_transform(documents)

# X is a sparse matrix representing the TF-IDF vectors
print("TF-IDF Sparse Matrix:")
print(X)

# To see the feature names (4-grams).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# on releases that predate it, and convert to a plain list so the printed
# output keeps the same list form as before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print("\nFeature Names (4-grams):")
print(feature_names)


TF-IDF Sparse Matrix:
  (0, 19)	0.2342968107614109
  (0, 32)	0.189683460929104
  (0, 51)	0.189683460929104
  (0, 9)	0.189683460929104
  (0, 36)	0.189683460929104
  (0, 13)	0.189683460929104
  (0, 0)	0.189683460929104
  (0, 46)	0.2342968107614109
  (0, 45)	0.2342968107614109
  (0, 40)	0.2342968107614109
  (0, 28)	0.2342968107614109
  (0, 21)	0.2342968107614109
  (0, 1)	0.2342968107614109
  (0, 14)	0.2342968107614109
  (0, 22)	0.2342968107614109
  (0, 48)	0.15507865836785725
  (0, 5)	0.15507865836785725
  (0, 43)	0.15507865836785725
  (0, 31)	0.15507865836785725
  (0, 2)	0.189683460929104
  (0, 42)	0.2342968107614109
  (0, 30)	0.2342968107614109
  (0, 26)	0.15507865836785725
  (0, 50)	0.15507865836785725
  (1, 10)	0.19447465538892725
  :	:
  (2, 30)	0.1870245011659935
  (2, 26)	0.12378960101285746
  (2, 50)	0.12378960101285746
  (3, 20)	0.2850562907261762
  (3, 6)	0.2247416538009314
  (3, 32)	0.18194773786870677
  (3, 51)	0.18194773786870677
  (3, 9)	0.18194773786870677
  (3, 36)	0.18194

