## Building a regression model

In [1]:
from datetime import datetime

In [2]:
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
df = pd.read_csv('data.csv')

In [4]:
df

Unnamed: 0.1,Unnamed: 0,Date,Video_name,Views,Likes,Comments,Hashtags,Duration
0,0,2021-12-26T15:43:47Z,Spiderman fans please explain yourselves..,1491856,128992,4128,['ad\n\nNew Tsuki has launched ðŸ‘˜: https://bit....,PT10M30S
1,1,2021-12-22T19:11:27Z,Next Gen Graphics Is Kinda Nuts..,2678694,168091,6323,['Subscribe\U0001f9ce\nðŸ¥¤Gfuel(affiliate): http...,PT11M27S
2,2,2021-12-20T14:01:11Z,"You Laugh You Win A BILLION $$$ (maybe, its no...",3135226,248601,5265,['Subscribe\U0001f9ce\nðŸ¥¤Gfuel(affiliate): http...,PT10M14S
3,3,2021-12-20T15:59:32Z,Five Nights at Freddy's Security Breach Gamepl...,2058838,137306,4842,['AD - Get my NEW G FUEL Pewdiepie SnÃ¶ Shaker!...,PT19M29S
4,4,2021-12-17T15:16:43Z,I Found The New Forbidden Minecraft Block..,2241987,165961,5810,['Subscribe\U0001f9ce\nðŸ¥¤Gfuel(affiliate): http...,PT13M25S
5,5,2021-12-16T16:47:49Z,NFT Has hit a new low...,2409866,178618,6933,['Subscribe\U0001f9ce\nðŸ¥¤Gfuel(affiliate): http...,PT8M20S
6,6,2021-12-15T17:35:00Z,Gary Vee is a Wee Bit Cringe,1944015,127706,6786,['Subscribe\U0001f9ce\nðŸ¥¤Gfuel(affiliate): http...,PT17M10S
7,7,2021-12-14T16:00:15Z,I Fell Into The New Minecraft Lush Cave!,3289589,235803,7494,['AD - Get the GHOST PewDiePie Tambourine Comb...,PT16M56S
8,8,2021-12-13T14:06:47Z,Bad Design Deserves To Go To Jail.,3678446,266438,5198,['Subscribe\U0001f9ce\nðŸ¥¤Gfuel(affiliate): http...,PT10M3S
9,9,2021-12-10T14:12:39Z,They figured out atoms existed thousands of ye...,2046749,179363,8609,['Subscribe\U0001f9ce\nðŸ¥¤Gfuel(affiliate): http...,PT11M59S


In [5]:
def convert_to_list(tags):
    tags = tags.strip('][').split(', ')
    ans = ""
    for i in tags:
        ans = ans + i[1:-1]
    return ans


In [6]:
df['Hashtags'] = df['Hashtags'].apply(lambda x:convert_to_list(x))


In [7]:
def duration_to_time(duration):
    times = duration.split('M')
    if len(times)>=2 and times[1]!='':
        time = float(times[0][2:]) + float(times[1][:-1])/60
    else:
        time = float(times[0][2:])
    return round(time,2)


In [8]:
df['Duration'] = df['Duration'].apply(lambda x:duration_to_time(x))


In [9]:
import calendar

In [10]:
def day_of_week(date):
    time_ = date.split('T')
    date = time_[0]
    date = datetime.strptime(date, '%Y-%m-%d').weekday()
    return date

In [11]:
df['Day'] = df['Date'].apply(lambda x:day_of_week(x))


In [12]:
def find_time(time):
    time_ = time.split('T')
    time_fin = time_[1]
    time_fin = int(time_fin[0:2])*60 + int(time_fin[3:5])
    return time_fin


In [13]:
df['Time'] = df['Date'].apply(lambda x:find_time(x))


In [14]:
def find_day(day):
    day_list = day.split('T')
    day_fin = day_list[0]
    day_fin = (int(day_fin[5:7])-1)*30 + int(day_fin[8:10])
    return day_fin


In [15]:
df['Days'] = df['Date'].apply(lambda x:find_day(x))


In [16]:
from sklearn.linear_model import LinearRegression


## Text Processing

In [17]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))


def remove_stop(title,stop_words):
    new_sent = ""
    words = word_tokenize(title)
    for i in words:
        if i not in stop_words:
            new_sent = new_sent + i
            new_sent = new_sent + " "
    return new_sent


In [18]:
import re
def clean_tweet(text):
    text1 = re.sub(r'@[A-Za-z0-9]+','',text)
    text2 = re.sub(r'#','',text1)
    hashtags = re.findall(r'#[A-Za-z0-9]+',text)
    hashtags = [i[1:] for i in hashtags]
    text3 = re.sub(r'RT[\s]+','',text2)
    text4 = re.sub(r'https?:\/\/\S+','',text3)
    text5 = re.sub(r'http?:\/\/\S+','',text4)
    text6 = re.sub(r'\n','',text5)
    return text6


In [19]:
import preprocessor as p
def remove_emoticons(tweet):
    tweet = p.clean(tweet)
    return tweet

In [20]:
from nltk.stem import WordNetLemmatizer


def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [21]:
df['Video_name'] = df['Video_name'].str.lower()
df['Video_name'] = df['Video_name'].apply(lambda x:remove_stop(x,stop_words))
df['Video_name'] = df['Video_name'].apply(lambda x:clean_tweet(x))
df['Video_name'] = df['Video_name'].apply(lambda x:remove_emoticons(x))
df['Video_name'] = df['Video_name'].apply(lambda x:lemmatize_words(x))
df['Hashtags'] = df['Hashtags'].str.lower()



In [22]:
df['ratio'] = (df['Likes']/df['Views'])*100

In [23]:
df

Unnamed: 0.1,Unnamed: 0,Date,Video_name,Views,Likes,Comments,Hashtags,Duration,Day,Time,Days,ratio
0,0,2021-12-26T15:43:47Z,spiderman fan please explain ..,1491856,128992,4128,ad\n\nnew tsuki has launched ðŸ‘˜: https://bit.ly...,10.5,6,943,356,8.646411
1,1,2021-12-22T19:11:27Z,next gen graphic kinda nut ..,2678694,168091,6323,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,11.45,2,1151,352,6.27511
2,2,2021-12-20T14:01:11Z,"laugh win billion $ $ $ ( maybe , impossible w...",3135226,248601,5265,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,10.23,0,841,350,7.929285
3,3,2021-12-20T15:59:32Z,five night freddy 's security breach gameplay ...,2058838,137306,4842,ad - get my new g fuel pewdiepie snÃ¶ shaker!!:...,19.48,0,959,350,6.669102
4,4,2021-12-17T15:16:43Z,found new forbidden minecraft block ..,2241987,165961,5810,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,13.42,4,916,347,7.402407
5,5,2021-12-16T16:47:49Z,nft hit new low ...,2409866,178618,6933,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,8.33,3,1007,346,7.411947
6,6,2021-12-15T17:35:00Z,gary vee wee bit cringe,1944015,127706,6786,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,17.17,2,1055,345,6.569188
7,7,2021-12-14T16:00:15Z,fell new minecraft lush cave !,3289589,235803,7494,ad - get the ghost pewdiepie tambourine comboðŸ‘Š...,16.93,1,960,344,7.16816
8,8,2021-12-13T14:06:47Z,bad design deserves go jail .,3678446,266438,5198,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,10.05,0,846,343,7.243222
9,9,2021-12-10T14:12:39Z,figured atom existed thousand year ago still d...,2046749,179363,8609,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,11.98,4,852,340,8.763312


## Vectorization and train test

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
vect = TfidfVectorizer(ngram_range=[1,2]).fit(df['Video_name'].iloc[:-1])

In [26]:
X_train = vect.transform(df['Video_name'].iloc[:-1])

In [27]:
X_train_2 = df[['Duration','Day','Time','Days']].iloc[:-1]

In [28]:
X_train_main = pd.merge(pd.DataFrame(X_train.toarray()), X_train_2, left_index=True, right_index=True)

In [29]:
y_train= df['Views'].iloc[:-1]

In [30]:
X_test_fin = pd.DataFrame({'Duration':[12.30],'Day':[5],'Time':[1430],'Days':[334]})
x = ["protective brother spy sister"]
x = vect.transform(x)
X_test_1 = pd.DataFrame(x.toarray())
X_test_2 = X_test_fin
X_test_final = pd.merge(X_test_1, X_test_2, left_index=True, right_index=True)

### Linear Regression

In [31]:
reg = LinearRegression().fit(X_train,y_train)

In [32]:
import numpy as np

In [33]:
reg2 = LinearRegression().fit(X_train_2,y_train)

In [34]:
# !pip install mlxtend

In [35]:
from mlxtend.regressor import StackingRegressor
lr = LinearRegression()
sclf = StackingRegressor(regressors=[reg, reg2], 
                      meta_regressor=lr)

In [36]:
sclf.fit(X_train_main,y_train)

StackingRegressor(meta_regressor=LinearRegression(),
                  regressors=[LinearRegression(), LinearRegression()])

In [37]:
sclf.predict(X_test_final)

array([2457413.62398941])

## Decision Trees

In [38]:
from sklearn.tree import DecisionTreeRegressor

In [39]:
reg_tree_1 = DecisionTreeRegressor().fit(X_train,y_train)

In [40]:
reg_tree_2 = DecisionTreeRegressor().fit(X_train_2,y_train)

In [41]:
d_reg = DecisionTreeRegressor()
d_tree_reg = StackingRegressor(regressors=[reg_tree_1, reg_tree_2], 
                      meta_regressor=d_reg)

In [42]:
d_tree_reg.fit(X_train_main,y_train)

StackingRegressor(meta_regressor=DecisionTreeRegressor(),
                  regressors=[DecisionTreeRegressor(), DecisionTreeRegressor()])

In [43]:
d_tree_reg.predict(X_test_final)

array([2750518.])

## Support Vector Regressor

In [44]:
from sklearn.svm import SVR

In [45]:
reg_svr_1 = SVR().fit(X_train,y_train)

In [46]:
reg_svr_2 = SVR().fit(X_train_2,y_train)

In [47]:
reg_svr = SVR()
svr_reg = StackingRegressor(regressors=[reg_svr_1, reg_svr_2], 
                      meta_regressor=reg_svr)

In [48]:
svr_reg.fit(X_train_main,y_train)

StackingRegressor(meta_regressor=SVR(), regressors=[SVR(), SVR()])

In [49]:
svr_reg.predict(X_test_final)

array([2705303.9060248])

## Lasso Regression

In [50]:
from sklearn.linear_model import Lasso

In [51]:
lasso_reg_1 = Lasso().fit(X_train,y_train)

In [52]:
lasso_reg_2 = Lasso().fit(X_train_2,y_train)

In [53]:
reg_lasso = Lasso()
reg_lasso_fin = StackingRegressor(regressors=[lasso_reg_1, lasso_reg_2], 
                      meta_regressor=reg_lasso)

In [54]:
reg_lasso_fin.fit(X_train_main,y_train)

StackingRegressor(meta_regressor=Lasso(), regressors=[Lasso(), Lasso()])

In [55]:
reg_lasso_fin.predict(X_test_final)

array([3017709.47085491])

## Random Forest Regressor

In [56]:
from sklearn.ensemble import RandomForestRegressor

In [57]:
rf_reg_1 = RandomForestRegressor().fit(X_train,y_train)

In [58]:
rf_reg_2 = RandomForestRegressor().fit(X_train_2,y_train)

In [59]:
reg_rf = RandomForestRegressor()
reg_rf_fin = StackingRegressor(regressors=[rf_reg_1, rf_reg_2], 
                      meta_regressor=reg_rf)

In [60]:
reg_rf_fin.fit(X_train_main,y_train)

StackingRegressor(meta_regressor=RandomForestRegressor(),
                  regressors=[RandomForestRegressor(), RandomForestRegressor()])

In [61]:
reg_rf_fin.predict(X_test_final)

array([2658607.26])

## Finding Best Video Names and other parameters

In [62]:
df

Unnamed: 0.1,Unnamed: 0,Date,Video_name,Views,Likes,Comments,Hashtags,Duration,Day,Time,Days,ratio
0,0,2021-12-26T15:43:47Z,spiderman fan please explain ..,1491856,128992,4128,ad\n\nnew tsuki has launched ðŸ‘˜: https://bit.ly...,10.5,6,943,356,8.646411
1,1,2021-12-22T19:11:27Z,next gen graphic kinda nut ..,2678694,168091,6323,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,11.45,2,1151,352,6.27511
2,2,2021-12-20T14:01:11Z,"laugh win billion $ $ $ ( maybe , impossible w...",3135226,248601,5265,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,10.23,0,841,350,7.929285
3,3,2021-12-20T15:59:32Z,five night freddy 's security breach gameplay ...,2058838,137306,4842,ad - get my new g fuel pewdiepie snÃ¶ shaker!!:...,19.48,0,959,350,6.669102
4,4,2021-12-17T15:16:43Z,found new forbidden minecraft block ..,2241987,165961,5810,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,13.42,4,916,347,7.402407
5,5,2021-12-16T16:47:49Z,nft hit new low ...,2409866,178618,6933,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,8.33,3,1007,346,7.411947
6,6,2021-12-15T17:35:00Z,gary vee wee bit cringe,1944015,127706,6786,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,17.17,2,1055,345,6.569188
7,7,2021-12-14T16:00:15Z,fell new minecraft lush cave !,3289589,235803,7494,ad - get the ghost pewdiepie tambourine comboðŸ‘Š...,16.93,1,960,344,7.16816
8,8,2021-12-13T14:06:47Z,bad design deserves go jail .,3678446,266438,5198,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,10.05,0,846,343,7.243222
9,9,2021-12-10T14:12:39Z,figured atom existed thousand year ago still d...,2046749,179363,8609,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,11.98,4,852,340,8.763312


In [71]:
df = df.sort_values(by = 'Views')

In [72]:
df

Unnamed: 0.1,Unnamed: 0,Date,Video_name,Views,Likes,Comments,Hashtags,Duration,Day,Time,Days,ratio
0,0,2021-12-26T15:43:47Z,spiderman fan please explain ..,1491856,128992,4128,ad\n\nnew tsuki has launched ðŸ‘˜: https://bit.ly...,10.5,6,943,356,8.646411
6,6,2021-12-15T17:35:00Z,gary vee wee bit cringe,1944015,127706,6786,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,17.17,2,1055,345,6.569188
9,9,2021-12-10T14:12:39Z,figured atom existed thousand year ago still d...,2046749,179363,8609,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,11.98,4,852,340,8.763312
3,3,2021-12-20T15:59:32Z,five night freddy 's security breach gameplay ...,2058838,137306,4842,ad - get my new g fuel pewdiepie snÃ¶ shaker!!:...,19.48,0,959,350,6.669102
4,4,2021-12-17T15:16:43Z,found new forbidden minecraft block ..,2241987,165961,5810,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,13.42,4,916,347,7.402407
5,5,2021-12-16T16:47:49Z,nft hit new low ...,2409866,178618,6933,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,8.33,3,1007,346,7.411947
14,14,2021-12-03T13:57:16Z,'s ? zamn !,2499168,208723,7762,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,8.35,4,837,333,8.351699
17,17,2021-11-30T00:59:03Z,... 're supposed eat pizza ..,2621938,200836,13260,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,20.17,1,59,330,7.65983
1,1,2021-12-22T19:11:27Z,next gen graphic kinda nut ..,2678694,168091,6323,subscribe\u0001f9ce\nðŸ¥¤gfuel(affiliate): https:...,11.45,2,1151,352,6.27511
18,18,2021-11-27T18:00:44Z,reacting worst plastic surgery,2705304,179627,5263,ad - go to https://nordvpn.com/pewdiepie to ge...,13.08,5,1080,327,6.639808


In [87]:
import nltk

In [122]:
from nltk.tag import pos_tag


def get_optimal_params(df):
    words_imp = []
    df_sorted = df.sort_values(by='Views')
    lst_names = list(df['Video_name'])[15:]
    for i in range(len(lst_names)):
        tagged_sent = pos_tag(lst_names[i].split())
        for word in tagged_sent:
            if word[1]=='NN':
                words_imp.append(word[0])
    duration = list(df_sorted['Duration'])[15:]
    duration = [int(x) for x in duration]
    return words_imp,round((sum(duration)/len(duration)),2)


In [123]:
get_optimal_params(df)

(['minecraft',
  'lush',
  'cave',
  'design',
  'jail',
  'minecraft',
  'update',
  'hate',
  'mukbang',
  'mr',
  'beast',
  'game',
  'review'],
 14.0)

In [1]:
string_ = 'PT14S'
ans = string_.split('M')
ans

['PT14S']