In [None]:
# Standardize column names: lower-case them and replace spaces with underscores.
# (A lower-case-only comprehension used to precede this line, but its result was
# overwritten immediately, so it was dead code and has been dropped.)
standard_columns = [col_name.lower().replace(' ', '_') for col_name in data.columns]
data.columns = standard_columns

In [None]:
#Removing outliers automatically
def remove_outliers(df, threshold=1.5, in_columns=None, skip_columns=None):
    """Drop rows lying outside the IQR fences of the selected columns.

    For every column in `in_columns` (except those in `skip_columns`) the
    25th/75th percentiles are computed on the rows that are still left, and
    only rows strictly inside
    (q25 - threshold * IQR, q75 + threshold * IQR) are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    threshold : float, default 1.5
        Multiplier applied to the inter-quartile range.
    in_columns : iterable of column labels, optional
        Columns to check. Defaults to every numeric column of `df`.
        (The default is resolved at call time; it used to be evaluated once,
        at definition time, against whatever global `df` existed then.)
    skip_columns : iterable of column labels, optional
        Columns to leave unchecked. Defaults to none.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that survived every column filter.
    """
    if in_columns is None:
        in_columns = df.select_dtypes(np.number).columns
    if skip_columns is None:
        skip_columns = []

    for column in in_columns:
        if column in skip_columns:
            continue
        upper = np.percentile(df[column], 75)
        lower = np.percentile(df[column], 25)
        iqr = upper - lower
        upper_limit = upper + threshold * iqr
        lower_limit = lower - threshold * iqr
        # filtering is cumulative: later columns only see the rows kept so far
        df = df[(df[column] > lower_limit) & (df[column] < upper_limit)]
    return df
#df2_ = remove_outliers(df2_) - apply
#df2 = remove_outliers(df1, threshold=1.5, in_columns=['target_d', 'avggift']) - remove outliers from specific column

In [None]:
#Changing weight to kilograms(float)
def weight_kilos(x):
    """Convert a weight in pounds (e.g. "150lbs") to whole kilograms as a float.

    The "lbs" suffix is blanked out, the rest is parsed as a float, multiplied
    by 0.4535 and rounded to zero decimals.
    """
    pounds = float(str(x).replace("lbs", ' '))
    return round(pounds * 0.4535, 0)
#data['weight']  = data['weight'].apply(lambda x: weight_kilos(x)) - Application

In [None]:
#Changing height to centimeters(float)
def height_centimeters(x):
    """Convert a feet/inches height such as 5'11" to centimeters (float).

    The quote characters are treated as separators. The first number is read
    as feet (30.48 cm each) and the second, when present, as inches (2.54 cm
    each). A value with feet only, e.g. 6', is accepted and counted as zero
    inches instead of raising IndexError.
    """
    parts = str(x).replace("'", ' ').replace('"', ' ').split()
    feet = float(parts[0])
    inches = float(parts[1]) if len(parts) > 1 else 0.0
    return feet * 30.48 + inches * 2.54
#my_data2['height']  = my_data2['height'].apply(lambda x: height_centimeters(x)) - Application

In [None]:
# Percentage of missing values per column of `data`, as a one-column dataframe
nulls = pd.DataFrame(data.isna().sum()*100/len(data), columns=['percentage'])
# Display the columns with the highest share of nulls first (the sorted result is only shown, not stored)
nulls.sort_values('percentage', ascending = False)

In [None]:
# Filtering a dataframe with boolean masks: number of rows with target_d strictly
# between 10 and 50, and number of rows with target_d of 50 or more (shown as a 2-tuple)
len(df[(df['target_d'] > 10) & (df['target_d'] < 50)]) , len(df[df['target_d'] >= 50])

In [None]:
# Dealing with categorical columns:
# collect every column that has more than 50 distinct categories, showing its
# value counts on the way, so those columns can be removed later
remove_cols = []

for col in dfcat:
    if len(dfcat[col].unique()) <= 50:
        continue
    display(dfcat[col].value_counts())
    remove_cols.append(col)

len(remove_cols)
# With more time, it would be ideal to analyse these columns one by one and check
# whether they can be bucketed.
# It would also be useful to visualize the average donation within each category,
# to look for patterns.
# Yet another approach is to look for columns with hierarchical values, which
# could be replaced by discrete numerical values.

In [None]:
# In the first run, I prefer not to have any scaling, just to have a benchmark on my metrics
# then i come back to these steps and check different scaling techniques to see which is better

# usually you'll choose one scaling technique for the whole dataframe

# if you are going for StandardScaler, MinMaxScaler or Normalizer, remember to do it after the splits:
## X-y split for Normalizer;
## train-test split for the other two

# for this example i chose BoxCox transformation

def boxcox_transform(df):
    """Apply a Box-Cox transformation to every numeric column of `df`.

    Box-Cox needs strictly positive input, so values <= 0 are first treated
    as missing and replaced, together with existing NaNs, by the mean of the
    remaining (positive) values of the column.

    The caller's frame is left untouched: the work is done on a copy, which
    is what the commented usage below this function expects
    (`df, _ci = boxcox_transform(df)` to overwrite).

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    (pandas.DataFrame, dict)
        The transformed copy, and a dict mapping each numeric column to a
        one-element list holding the lambda fitted by `stats.boxcox` (the
        second value it returns). Pass that value to
        `scipy.special.inv_boxcox` to undo the transformation.
    """
    df = df.copy()
    numeric_cols = df.select_dtypes(np.number).columns
    _ci = {column: None for column in numeric_cols}
    for column in numeric_cols:
        # no column is expected to hold non-positive numbers; masking them
        # avoids -inf / errors in the transform (NaN stays NaN here as well)
        positive = df[column].where(df[column] > 0)
        positive = positive.fillna(positive.mean())
        transformed_data, lmbda = stats.boxcox(positive)
        df[column] = transformed_data
        _ci[column] = [lmbda]
    return df, _ci

# df, _ci = boxcox_transform(df) - Apply - if you want to overwrite the already existing df
# df

In [None]:
# Plot a correlation heatmap.
# Only numeric/boolean columns are correlated: since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead.
corr_matrix = data.select_dtypes(include=[np.number, 'bool']).corr(method='pearson')  # pearson is the default
fig, ax = plt.subplots(figsize=(10, 8))
# draw on the axes created above instead of relying on the implicit current axes
ax = sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

In [None]:
# Plot the distribution of every numerical column of `data`, one figure per column
for col in data.select_dtypes(np.number):
    sns.displot(data[col])
    plt.show()

In [None]:
# apply log transformation since it will make the 'outliers' interval smaller
def log_transform(x):
    """Natural log of a scalar, or NaN when the log is not defined.

    Returns np.nan for zero, negative, infinite and NaN input, so the caller
    can fill those positions afterwards. Negative values already produced NaN
    before (through np.log, with a RuntimeWarning); they are now caught by
    the guard instead.

    This single definition replaces the former pair of a `def` and an
    equivalent `lambda` bound to the same name.
    """
    if np.isfinite(x) and x > 0:
        return np.log(x)
    return np.nan  # canonical spelling; aliases such as np.NaN were removed in NumPy 2.0


def log_scaled(df):
    """Log-transform every numeric column of `df` and return the result.

    Values for which the natural log is undefined (zero, negative, infinite,
    NaN) become missing and are then filled with the mean of the
    *log-transformed* column. The fill value used to be the mean of the raw
    column, which put it on a different scale from the rest of the column.

    The caller's frame is not modified; a transformed copy is returned.
    """
    df = df.copy()
    for column in df.select_dtypes(np.number).columns:
        values = df[column]
        valid = np.isfinite(values) & (values > 0)
        logged = np.log(values.where(valid))
        df[column] = logged.fillna(logged.mean())
    return df
# data1 = data.copy()

# data1['TIMELAG'] = data1['TIMELAG'].apply(log_transform).fillna(data1['TIMELAG'].mean()) - Apply

In [None]:
#Fill NaNs with the mean of the columns
def fill_nan_with_mean(df):
    """Fill the NaNs of every numeric column with that column's mean.

    Works in place on `df` (columns containing NaNs are reassigned) and also
    returns `df`. Non-numeric columns are left as they are.
    """
    for column in df.select_dtypes(np.number).columns:
        series = df[column]
        if series.isna().any():
            df[column] = series.fillna(series.mean())
    return df

In [None]:
import math
# apply log10 transformation since it will make the 'outliers' interval smaller
def log_transform2(x):
    """Base-10 log of a scalar, or NaN when the log is not defined.

    Returns np.nan for zero, negative, infinite and NaN input. Negative
    values used to reach math.log10 and raise ValueError; they now give NaN
    like every other invalid value.
    """
    if np.isfinite(x) and x > 0:
        return math.log10(x)
    return np.nan  # canonical spelling; aliases such as np.NaN were removed in NumPy 2.0

In [None]:
# Log-scale X and y, then filter to get a "new" dataframe.
# Zeros are turned into NaN before the log is taken, and every NaN left afterwards is filled with 0.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas (DataFrame.map replaces it) - confirm against the installed version
X_scaled = X.applymap(lambda x: np.nan if x == 0 else x).applymap(np.log).fillna(0)
y_scaled = y.apply(lambda x: np.nan if x == 0 else x).apply(np.log).fillna(0)

# Keep only the rows whose log-scaled target is strictly positive, and the matching rows of X_scaled
y_scaled_higher = y_scaled[y_scaled > 0]
x_scaled_higher = X_scaled.loc[y_scaled_higher.index]

In [None]:
# Scale the numeric features with StandardScaler
from sklearn.preprocessing import StandardScaler
transformer = StandardScaler()
transformer.fit(numerics)
# wrap the scaled array in a dataframe that carries the original column names
scaled = pd.DataFrame(transformer.transform(numerics), columns=numerics.columns)
scaled.head()

In [None]:
def target_encode_multiclass(X, y):
    """Target-encode the categorical columns of X once per class of y.

    Parameters
    ----------
    X : pandas.DataFrame
        Features; columns of dtype 'object' are the ones that get encoded.
    y : pandas.Series
        Multiclass target.

    Returns
    -------
    pandas.DataFrame
        The non-object columns of X followed, for every class, by one
        target-encoded copy of each object column, named
        "<column>_<one-hot class column>".

    Notes
    -----
    The `return` used to sit inside the loop, so the function returned after
    encoding the first class only. It now runs after all classes are done.
    """
    y = y.astype(str)  # convert to string so the target can be one-hot encoded
    enc = ce.OneHotEncoder().fit(y)
    y_onehot = enc.transform(y)
    class_names = y_onehot.columns  # names of the one-hot encoded columns
    X_obj = X.select_dtypes('object')  # categorical columns
    X = X.select_dtypes(exclude='object')
    for class_ in class_names:
        # binary target "is this row of class_?" drives one target encoding
        enc = ce.TargetEncoder()
        enc.fit(X_obj, y_onehot[class_])
        temp = enc.transform(X_obj)
        temp.columns = [str(x) + '_' + str(class_) for x in temp.columns]
        X = pd.concat([X, temp], axis=1)  # append to the running result

    return X

In [None]:
# How to use PCA: fit it on the already scaled data
# NOTE: `preprocessing` is imported below but not used in this cell
from sklearn.decomposition import PCA
from sklearn import preprocessing
pca = PCA()
pca.fit(scaled)

In [None]:
# How to use RandomForest - Classification problem
# NOTE: RandomForestRegressor is imported below but not used in this cell
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report

model = RandomForestClassifier()

# hold out 30% of the rows for evaluation; random_state pins the split
X_train, X_test, y_train, y_test = train_test_split(X, y_group, test_size=0.3, random_state=42)

model.fit(X_train,y_train)

y_group_preds = model.predict(X_test)

# text report of the classification metrics on the held-out rows
print(classification_report(y_test, y_group_preds))

In [None]:
# One-hot encode the categorical features through a fitted encoder object, in case
# the same encoding has to be reused somewhere else besides this notebook.
# The encoder is configured with handle_unknown='error' and drop='first'.
encoder = OneHotEncoder(handle_unknown='error', drop='first')
encoder.fit(X_cat)

# .toarray() turns the transform output into a dense array
encoded = encoder.transform(X_cat).toarray()
encoded #.shape # 

In [None]:
# Put the numerical features and the encoded categoricals side by side (axis=1 joins column-wise)
X = np.concatenate([X_num, encoded], axis=1)
X.shape

In [None]:
# Train-test split: 30% of the rows are held out for testing; random_state pins the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Train a linear regression model on the training split
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Get predictions for the held-out X_test rows
predictions = model.predict(X_test)

In [None]:
# Linear-regression metrics on the test split, shown as (r-squared, MAE, RMSE).
# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# was deprecated and then removed from scikit-learn, while this form works on every version.
r2_score(y_test, predictions), mean_absolute_error(y_test, predictions), float(np.sqrt(mean_squared_error(y_test, predictions)))

In [None]:
# To reverse the Box-Cox scaling of the predictions.
# Metrics computed from transformed values are on the transformed scale, so the
# predictions have to be inverse-transformed first.
# NOTE(review): assumes `_ci['target_d']` holds the lambda fitted for the target column - confirm against the cell that produced `_ci`
from scipy.special import inv_boxcox

# overwrites `predictions` in place: run this cell once per set of predictions
predictions = inv_boxcox(predictions, _ci['target_d'])

In [None]:

# Dealing with ordinal categoricals - hard-coded category -> rank mappings.
# Series.map gives NaN for any value that is not a key of the mapping, so running this
# cell a second time (when the columns already hold the integer ranks) turns them into NaN.
customer_cats["coverage"] = customer_cats["coverage"].map({"Basic" : 0, "Extended" : 1, "Premium" : 2})
customer_cats["education"] = customer_cats["education"].map({"High School or Below" : 0, "College" : 1, "Bachelor" : 2,
                                                                          "Master" : 3, "Doctor": 4})
customer_cats["employmentstatus"] = customer_cats["employmentstatus"].map({"Unemployed" : 0, "Disabled" : 1, "Retired" : 2,
                                                                          "Medical Leave" : 3, "Employed": 4})
customer_cats["location_code"] = customer_cats["location_code"].map({"Rural" : 0, "Suburban" : 1, "Urban" : 2})
customer_cats["vehicle_size"] = customer_cats["vehicle_size"].map({"Small" : 0, "Medsize" : 1, "Large" : 2})
customer_cats.head()

In [None]:
# Plot a categorical variable (zipcode, x axis) against a numerical one (price, y axis)
sns.catplot(x="zipcode", y="price", data=df2)

In [None]:
# Text formatting styles: a triple-quoted f-string can span several lines and
# interpolates variables and whole expressions placed inside {...}
print(f'''This model saved ${savings} in mails.
        Missed ${total_missed_donations} in donations and wasted ${total_extra_spent_on_mail} in mails not responded.
        You made: ${sum(df['target_d']) - total_amount_spent_on_sent_mails}
           ''')



In [None]:
# Feature selection with the variance threshold technique:
# columns whose variance is below the threshold are dropped
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=0.9)
# fit on the numerical features, then keep only the columns that passed
temp = selection.fit(numerical).transform(numerical)
temp

In [None]:
# Checking correlations between variables with a heatmap (lower triangle only).
# Only numeric/boolean columns are correlated: since pandas 2.0 DataFrame.corr()
# raises on non-numeric columns instead of dropping them silently.
corr = dff_estate.select_dtypes(include=[np.number, 'bool']).corr()
# boolean mask hiding the upper triangle (diagonal included), which only mirrors the lower one
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(15, 20))

# draw on the axes created above instead of relying on the implicit current axes
ax = sns.heatmap(corr, mask=mask, cmap='coolwarm', vmax=1, center=0,
            square=True, linewidths=.5,annot=True, cbar_kws={"shrink": .5}, ax=ax);

In [None]:
# Manual upsampling: draw 400 rows from every class so that all classes end up
# with (roughly) the size of the majority class.
# No random_state is passed, so each run draws a different sample.
A = data[data['status'] == 'A'].sample(400, replace=True) # needs replace=True because the class has fewer than 400 rows
B = data[data['status'] == 'B'].sample(400, replace=True)
C = data[data['status'] == 'C'].sample(400) # no replacement needed: the class has 403 rows
D = data[data['status'] == 'D'].sample(400, replace=True)

upsampled = pd.concat([A, B, C, D]).sample(frac=1) # .sample(frac=1) only shuffles the rows
upsampled

In [None]:
# Automatic upsampling with SMOTE.
# SMOTE uses k-nearest neighbours to create new rows with similar features for the minority classes.
from imblearn.over_sampling import SMOTE

smote = SMOTE()

# features are every column except the class label
X = data.drop('status', axis=1)
y = data['status']

X_sm, y_sm = smote.fit_resample(X, y)
# class counts after resampling
y_sm.value_counts()

In [None]:
# Manual downsampling: draw 30 rows from every class, i.e. the size of the minority class.
# Sampling is without replacement and without a random_state, so each run draws a different sample.
A = data[data['status'] == 'A'].sample(30)
B = data[data['status'] == 'B'].sample(30)
C = data[data['status'] == 'C'].sample(30)
D = data[data['status'] == 'D'].sample(30)

downsampled = pd.concat([A, B, C, D]).sample(frac=1) # .sample(frac=1) only shuffles the rows
downsampled

In [None]:
# Tomek Links
# Pairs of nearly identical rows that belong to opposite classes.
# Removing the majority-class row of each pair helps the classifier.
from imblearn.under_sampling import TomekLinks

# features are every column except the class label
X = data.drop('status', axis=1)
y = data['status']

tomek = TomekLinks()
X_tl, y_tl = tomek.fit_resample(X, y)
# class counts after removing the links
y_tl.value_counts()

## Oops - this technique is only good for smaller imbalances

In [None]:

# Evaluating the classification results: annotated confusion-matrix heatmap
cf_matrix = confusion_matrix(y_test, predictions)
# One caption per cell, row by row: "True X" on the diagonal and "False X" elsewhere,
# where X is the class of the row. Generating the captions keeps them consistent
# (the hand-written list contained a stray 'False c').
class_labels = ['A', 'B', 'C', 'D']
group_names = [f"True {row}" if row == col else f"False {row}"
               for row in class_labels for col in class_labels]

# raw count and share of all predictions for every cell
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
# same layout as the matrix itself (4 x 4 for the four classes above)
labels = np.asarray(labels).reshape(cf_matrix.shape)
# fmt='' because the annotations are already formatted strings
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')

In [None]:
# sklearn.metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator (available since 1.0) is its replacement
# and takes the same estimator / X / y / cmap arguments.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(classification, X_test, y_test, cmap='Blues')
plt.show()

In [None]:
# To merge dataframes: join model_5 and results on their row index (both sides use the index as key)
general_model = model_5.merge(results, left_index = True, right_index=True)

In [1]:
### Classification - Random Forest

In [None]:
from imblearn.under_sampling import TomekLinks

# Watch out not to inflate your metrics. Ideally you do the train-test split
# first and call fit_resample on the training set only.

X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.3, random_state=42)

# the strategy is passed by keyword: recent imbalanced-learn releases no longer
# accept it as a positional argument
tl = TomekLinks(sampling_strategy='majority')

X_tl, y_tl = tl.fit_resample(np.array(X_train), y_train)

# class counts of the resampled training target
y_tl.value_counts()

### Some miscellaneous functions and usage

In [None]:
# Data summary using groupby and aggregations: mean, median, min and max of Volume for every BrandName
warshers.groupby('BrandName')[['Volume']].agg(['mean','median','min','max'])

In [None]:
# Composite visualization (only the last expression of the cell is displayed)
df.groupby('num_vrb')['cat_vrb'].value_counts() # count each cat_vrb value within every num_vrb group

df.groupby('num_vrb')['cat_vrb'].value_counts().unstack() # pivot the innermost index (cat_vrb) to column labels
# Create the stacked bar chart from the pivot table built above
df.groupby('num_vrb')['cat_vrb'].value_counts().unstack().plot(kind='bar',stacked=True,figsize=(10,6))

In [None]:
# Take the second '-'-separated part of each effective_to_date value as an integer
# NOTE(review): which date component that is depends on how the dates are formatted - confirm
discrete_customer['effective_to_date'].astype(str).str.split('-').apply(lambda x: x[1]).astype(int)

In [None]:
# A function to take in several models and compare their metrics
def train_test_models(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    models : iterable of estimators
        Objects exposing fit(X, y) -> fitted model and predict(X).
    X_train, X_test, y_train, y_test
        The usual train/test split.

    Returns
    -------
    dict
        Maps a model name to [r_squared, mae, rmse]. The name is the model's
        string form with a trailing "()" removed, so "LinearRegression()"
        gives "LinearRegression". A model built with parameters keeps its
        full string form, where the last two characters used to be cut off
        regardless of what they were.

    Notes
    -----
    RMSE is computed as sqrt(MSE) because the `squared=False` argument of
    mean_squared_error was deprecated and then removed from scikit-learn.
    """
    scores = {}
    for model_ in models:
        name = str(model_)
        if name.endswith('()'):
            name = name[:-2]

        model = model_.fit(X_train, y_train)
        model_prediction = model.predict(X_test)
        r_squared = r2_score(y_test, model_prediction)
        mae = mean_absolute_error(y_test, model_prediction)
        rmse = float(np.sqrt(mean_squared_error(y_test, model_prediction)))

        scores[name] = [r_squared, mae, rmse]

    return scores


### Web Scraping

In [None]:
# In case of multi-page scraping
# Extract the total number of songs: the third whitespace-separated token of the <h1> text in the header container
sp.find('div', attrs={'class': "fts-header__title-container"}).find('h1').get_text().split()[2]
# Convert that token to an integer
total_songs = int(sp.find('div', attrs={'class': "fts-header__title-container"}).find('h1').get_text().split()[2])
total_songs

In [None]:
# Page offsets: 10 items per page, counting down from the highest number (top - down)
starts = range(total_songs,0,-10)
list(starts)

In [None]:
# Get song titles and artists from one webpage:
# for every song element, the title is the text of its first <h3> and the artist the text of its first <span>
Titles = []
Artists = []

for song in songs_all:
    Titles.append(song.find('h3').get_text()) 
    Artists.append(song.find('span').get_text())

In [None]:
# Another way: skip the elements that have no <h3> instead of failing on them
for a in table:
    if a.find('h3'):   
        print(a.find('h3').get_text())

# same filter as a list comprehension, with newlines stripped from both ends of the text
title = [a.find('h3').get_text().strip('\n') for a in table if a.find('h3')]
title

In [None]:
# Get the attributes in a container
def get_movie_info(movies):
    """Collect title, rating, genre, runtime and link of every movie element.

    Parameters
    ----------
    movies : iterable
        Parsed HTML elements (one per movie) exposing `find(name, attrs=...)`.

    Returns
    -------
    dict
        Keys 'title', 'rating', 'genre', 'runtime' and 'link', each mapping to
        a list with one entry per movie. Any piece of information that cannot
        be found is reported as 'Not informed.'.

    Notes
    -----
    The links were collected before but left out of the returned dict; they
    are now returned under 'link'. A missing <h3> or anchor no longer raises
    AttributeError.
    """
    missing = 'Not informed.'
    titles, ratings, genres, runtimes, links = [], [], [], [], []

    for movie in movies:
        header = movie.find('h3')
        title_anchor = header.find('a') if header is not None else None
        rating_tag = movie.find('strong')
        genre_tag = movie.find('span', attrs={'class': 'genre'})
        runtime_tag = movie.find('span', attrs={'class': 'runtime'})

        # the link lives in the anchor of the "lister-item-header" <h3>
        link_header = movie.find('h3', attrs={'class': 'lister-item-header'})
        link_anchor = link_header.find('a') if link_header is not None else None
        href = link_anchor.get('href') if link_anchor is not None else None

        titles.append(title_anchor.get_text() if title_anchor is not None else missing)
        ratings.append(rating_tag.get_text() if rating_tag is not None else missing)
        genres.append(genre_tag.get_text(strip=True) if genre_tag is not None else missing)
        runtimes.append(runtime_tag.get_text() if runtime_tag is not None else missing)
        links.append('http://www.imdb.com' + href if href else missing)

    dct = {'title': titles, 'rating': ratings, 'genre': genres, 'runtime': runtimes, 'link': links}

    return dct

In [None]:
# Create a dataframe from the pages of a website using the attributes from above.
# The per-page frames are collected in a list and concatenated once at the end:
# concatenating inside the loop copies the growing frame on every page.
page_frames = []

for start in starts:
    # timeout so that a stalled connection cannot hang the notebook indefinitely
    r = requests.get(f'https://www.imdb.com/search/title/?title_type=feature&release_date=2021-01-01,&user_rating=6.5,&num_votes=100,&start={start}&ref_=adv_nxt', timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')
    movies = soup.find_all('div', attrs={'class': "lister-item-content"})
    info_dct = get_movie_info(movies) # function from above for getting the attributes
    new_df = pd.DataFrame.from_dict(info_dct)
    page_frames.append(new_df)

# as before, every page keeps its own row index (no re-indexing);
# an empty `starts` still gives an empty dataframe
df = pd.concat(page_frames) if page_frames else pd.DataFrame()

df

### NLP

In [None]:
#Libraries to import and general settings
import pandas as pd
import re
import string
import nltk
# show up to 100 characters per cell when displaying dataframes
pd.set_option('display.max_colwidth', 100)

# English stopword list and Porter stemmer used by the cleaning functions
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# NOTE(review): read_csv treats the first line of the file as a header by default, and the
# column names are overwritten right below. If the file has no header row, its first record
# is lost here - confirm, and pass header=None if so.
data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

In [None]:
def clean_text(text):
    """Remove punctuation, tokenize, and drop stopwords.

    Parameters
    ----------
    text : str
        Raw text. Case is not changed here; lower-case it beforehand if the
        stopword list is lower-case.

    Returns
    -------
    list of str
        The word tokens of `text` that are not in the module-level
        `stopwords` list.
    """
    # remove punctuation characters
    text = "".join([char for char in text if char not in string.punctuation])
    # tokenize: raw string, because '\w' in a normal string literal is an
    # invalid escape sequence (re.split(r'\W+', text) is the alternative)
    tokens = re.findall(r'\w+', text)
    # remove stopwords
    return [word for word in tokens if word not in stopwords]

# Application: lower-case every message, then clean it into a list of tokens
data['body_text_nostop'] = data['body_text'].apply(lambda x: clean_text(x.lower())) #application

data.head()

In [None]:
# Stemming tokenized text
def stemming(tokenized_list):
    """Return a new list with every token stemmed by the module-level stemmer `ps`."""
    return list(map(ps.stem, tokenized_list))

# Application: stem the stopword-free token list of every message
data['body_text_stemmed'] = data['body_text_nostop'].apply(lambda x: stemming(x))

data.head()

In [None]:
# Lemmatizing tokenized text
def lemmatizing(tokenized_text):
    """Return a new list with every token lemmatized by the module-level `wn`.

    NOTE(review): `wn` is not defined in the cells visible here; presumably it
    is a lemmatizer object created elsewhere - confirm before running.
    """
    return list(map(wn.lemmatize, tokenized_text))

# Application: lemmatize the stopword-free token list of every message
data['body_text_lemmatized'] = data['body_text_nostop'].apply(lambda x: lemmatizing(x))

data.head(10)

In [None]:
# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer


# clean_text is used as the analyzer, so it does the whole text -> tokens step
count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['body_text'])
print(X_counts.shape)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement
print(count_vect.get_feature_names_out())

In [None]:
# Convert the vectorized tokens to an array and then to a dataframe
# NOTE(review): X_count_sample and count_vect_sample are not defined in the cells
# visible here - presumably a vectorizer fitted on a sample of the data; confirm.
X_count_df = pd.DataFrame(X_count_sample.toarray())
# Assign the names of the vector columns to the dataframe columns.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
X_count_df.columns = count_vect_sample.get_feature_names_out()
X_count_df

In [None]:
#### Complete Read & Clean raw text
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# English stopword list and Porter stemmer used by clean_text below
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# NOTE(review): read_csv treats the first line of the file as a header by default, and the
# column names are overwritten right below. If the file has no header row, its first record
# is lost here - confirm, and pass header=None if so.
data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    """Share of the non-space characters of `text` that are punctuation.

    Returns a fraction between 0 and 1 rounded to 3 decimals (not a 0-100
    percentage). Text with no non-space characters (empty or spaces only)
    gives 0.0 instead of raising ZeroDivisionError.
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space, 3)

# Engineered features: message length without spaces, and the value count_punct returns for the message
data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    """Lower-case, strip punctuation, tokenize, drop stopwords and stem.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    list of str
        Stemmed tokens (via the module-level stemmer `ps`) that are not in
        the module-level `stopwords` list.

    Notes
    -----
    re.split can produce empty-string tokens at the edges of the text. They
    are kept, as before, so the resulting feature set does not change.
    """
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # raw string: '\W' in a normal string literal is an invalid escape sequence
    tokens = re.split(r'\W+', text)
    return [ps.stem(word) for word in tokens if word not in stopwords]

# TF-IDF features: clean_text is used as the analyzer, so it does the whole text -> tokens step
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

# engineered features first, then one column per TF-IDF term
X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
# The TF-IDF columns are labelled with integers and the engineered ones with strings;
# scikit-learn (1.2 and later) refuses to fit on mixed column-name types, so all labels become strings.
X_features.columns = X_features.columns.astype(str)
X_features.head()

In [None]:
#Build own Grid-Search

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; no random_state is passed, so the split differs on every run
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)


def train_RF(n_est, depth):
    """Fit one random forest and print its precision, recall and accuracy.

    Uses the module-level X_train / X_test / y_train / y_test. Precision and
    recall are computed for the 'spam' label; accuracy is the share of test
    predictions equal to the true label. Nothing is returned.

    Parameters
    ----------
    n_est : int
        Passed as n_estimators.
    depth : int or None
        Passed as max_depth.
    """
    forest = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    y_pred = forest.fit(X_train, y_train).predict(X_test)
    # fscore and support are returned as well but are not reported
    precision, recall, _, _ = score(y_test, y_pred, pos_label='spam', average='binary')
    accuracy = (y_pred == y_test).sum() / len(y_pred)
    report = 'Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'
    print(report.format(n_est, depth, round(precision, 3), round(recall, 3), round(accuracy, 3)))
# Run it: try every combination of the number of trees and the maximum depth
# (a depth of None is passed to the forest unchanged)

for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)