##### Importing all the required Libraries

In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,cross_val_score, cross_val_predict
from pandas_profiling import ProfileReport
from sklearn import *

##### Importing Dataset

In [2]:
Dataset = pd.read_csv("article.csv")
Dataset.head()

Unnamed: 0,title,author,year,journal,pages
0,Spectre Attacks: Exploiting Speculative Execut...,Paul Kocher::Daniel Genkin::Daniel Gruss::Wern...,2018.0,meltdownattack.com,
1,Meltdown,Moritz Lipp::Michael Schwarz 0001::Daniel Grus...,2018.0,meltdownattack.com,
2,An Evaluation of Object-Oriented DBMS Developm...,Frank Manola,1994.0,GTE Laboratories Incorporated,
3,DARWIN: On the Incremental Migration of Legacy...,Michael L. Brodie::Michael Stonebraker,1993.0,GTE Laboratories Incorporated,
4,"Integrating Heterogeneous, Autonomous, Distrib...",Mark F. Hornick::Joe D. Morrison::Farshad Nayeri,1991.0,GTE Laboratories Incorporated,


#### Droping the Author and Pages Coloum

In [7]:
del Dataset['pages']
del Dataset['author']

In [8]:
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2423311 entries, 0 to 2423310
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   title    object 
 1   year     float64
 2   journal  object 
dtypes: float64(1), object(2)
memory usage: 55.5+ MB


#### Checking Dataset for Missing or Null values and processing them

In [10]:
# Replicating data into new dataframe
nandata = Dataset.interpolate(method ='linear', limit_direction ='backward') 

# Changing year from float to int
intyear = nandata["year"].round().astype(int)

# Checking how many Null values we have in dataset
Dataset.isnull().sum()
nandata.isnull().sum()

# Assinging converted year to year coloum
nandata['year']=intyear
nandata.head()

Unnamed: 0,title,year,journal
0,Spectre Attacks: Exploiting Speculative Execut...,2018,meltdownattack.com
1,Meltdown,2018,meltdownattack.com
2,An Evaluation of Object-Oriented DBMS Developm...,1994,GTE Laboratories Incorporated
3,DARWIN: On the Incremental Migration of Legacy...,1993,GTE Laboratories Incorporated
4,"Integrating Heterogeneous, Autonomous, Distrib...",1991,GTE Laboratories Incorporated


In [11]:
# Dropping the Null values
newDataset = nandata.dropna()
newDataset.isnull().sum()

title      0
year       0
journal    0
dtype: int64

In [16]:
# Grouping the Dataset by Journal and Year
cleanDataset = newDataset.groupby(["journal", "year"])["title"].count().reset_index(name="Paper Per Year")

# Counting the occorance of the a Jorunal Name
sum(newDataset.groupby(["journal", "year"])["title"].count())
finalDataset = cleanDataset.sort_values(by = ['year'])
finalDataset

Unnamed: 0,journal,year,Paper Per Year
22756,J. Symb. Log.,1936,12
22757,J. Symb. Log.,1937,15
22758,J. Symb. Log.,1938,10
22759,J. Symb. Log.,1939,18
22760,J. Symb. Log.,1940,10
...,...,...,...
8719,Future Gener. Comput. Syst.,2021,54
22997,J. Ubiquitous Syst. Pervasive Networks,2021,8
7931,Eur. J. Comb.,2021,41
23997,Math. Comput.,2021,19


In [17]:
# Sorting the Dataset indexes
finalDataset = finalDataset.reset_index(drop=True)
finalDataset.head()

Unnamed: 0,journal,year,Paper Per Year
0,J. Symb. Log.,1936,12
1,J. Symb. Log.,1937,15
2,J. Symb. Log.,1938,10
3,J. Symb. Log.,1939,18
4,J. Symb. Log.,1940,10


In [18]:
# Converting the Journal Coloum into number categories so we can pass them into a model
finalDataset["journal"] = finalDataset["journal"].astype('category').cat.codes

In [19]:
finalDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30221 entries, 0 to 30220
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   journal         30221 non-null  int16
 1   year            30221 non-null  int64
 2   Paper Per Year  30221 non-null  int64
dtypes: int16(1), int64(2)
memory usage: 531.4 KB


In [20]:
finalDataset.head()

Unnamed: 0,journal,year,Paper Per Year
0,1433,1936,12
1,1433,1937,15
2,1433,1938,10
3,1433,1939,18
4,1433,1940,10


In [22]:
# Checking how many unique journals we have
len(finalDataset['journal'].unique())

1873

In [73]:
# Sipliting the Dataset before 2016 for training and testing purpose and data from 2017 to 2019 as unseen data.
SampleData = finalDataset[finalDataset['year'] <= 2016]
UnseenData = finalDataset[(finalDataset['year']>2016) & (finalDataset['year']<=2019)]

In [74]:
# Sipliting data into 80:20 ratio for training and testing
X_train, X_test, y_train, y_test = train_test_split(SampleData[['journal', 'year']],SampleData['Paper Per Year'],test_size = 0.2 , random_state = 0)

### Training Decision Tree Regressor

In [75]:
DTR = DecisionTreeRegressor(random_state = 0)
DTR.fit(X_train,y_train)

#Predicting the Pubication in a year using test set 
Predictions = DTR.predict(X_test)

#Decision Tree Regression Accuracy with test set
print('Decision Tree Regression Accuracy: ', DTR.score(X_test,y_test))

Decision Tree Regression Accuracy:  0.8692556847184814


#### Using K-Fold cross validation to validate the above code

In [76]:
#To use the K-fold cross validation we have to convet the data into array
X = np.asarray(SampleData[['journal', 'year']])
y = np.asarray(SampleData['Paper Per Year'])

#Predicting the SalePrice using cross validation (KFold method)
Predictions = cross_val_predict(DTR, X, y, cv=10 )
#Decision Tree Regression Accuracy with cross validation
accuracy_dt = metrics.r2_score(y, Predictions)
print('Cross-Predicted(KFold) Decision Tree Regression Accuracy: ', accuracy_dt)

Cross-Predicted(KFold) Decision Tree Regression Accuracy:  0.8976959152531453


#### Checking it on Unseen data

In [79]:
X_Unseen = UnseenData[['journal', 'year']]
Y_Unseen = UnseenData['Paper Per Year']

Unseen_Prediction = DTR.predict(X_Unseen)

#Decision Tree Regression Accuracy with test set
print('Decision Tree Regression Accuracy: ', DTR.score(X_Unseen,Y_Unseen))

Decision Tree Regression Accuracy:  0.7251742456501419


#### Testing with K-Fold on Unseen data

In [80]:
#To use the K-fold cross validation we have to convet the data into array
X = np.asarray(UnseenData[['journal', 'year']])
y = np.asarray(UnseenData['Paper Per Year'])

#Predicting the SalePrice using cross validation (KFold method)
Predictions = cross_val_predict(DTR, X, y, cv=10 )
#Decision Tree Regression Accuracy with cross validation
accuracy_dt = metrics.r2_score(y, Predictions)
print('Cross-Predicted(KFold) Decision Tree Regression Accuracy: ', accuracy_dt)

Cross-Predicted(KFold) Decision Tree Regression Accuracy:  0.9082661901051796


###  Training Random Forest Regression

In [82]:
RFR = RandomForestRegressor(n_estimators = 300 ,  random_state = 0)
RFR.fit(X_train,y_train)

#Predicting the SalePrices using test set 
Predictions = RFR.predict(X_test)

#Random Forest Regression Accuracy with test set
print('Random Forest Regression Accuracy: ', RFR.score(X_test,y_test))

#To use the K-fold cross validation we have to convet the data into array
X = np.asarray(SampleData[['journal', 'year']])
y = np.asarray(SampleData['Paper Per Year'])

#Predicting the SalePrice using cross validation (KFold method)
Predictions = cross_val_predict(DTR, X, y, cv=10 )
#Decision Tree Regression Accuracy with cross validation
accuracy_dt = metrics.r2_score(y, Predictions)
print('Cross-Predicted(KFold) Decision Tree Regression Accuracy: ', accuracy_dt)

Random Forest Regression Accuracy:  0.8840635985433466
Cross-Predicted(KFold) Decision Tree Regression Accuracy:  0.8976959152531453


#### Testing Random Forest on Unseen data

In [83]:
X_Unseen = UnseenData[['journal', 'year']]
Y_Unseen = UnseenData['Paper Per Year']

Unseen_Prediction = RFR.predict(X_Unseen)

#Decision Tree Regression Accuracy with test set
print('Decision Tree Regression Accuracy: ', RFR.score(X_Unseen,Y_Unseen))

Decision Tree Regression Accuracy:  0.6665075811322212


In [84]:
#To use the K-fold cross validation we have to convet the data into array
X = np.asarray(UnseenData[['journal', 'year']])
y = np.asarray(UnseenData['Paper Per Year'])

#Predicting the SalePrice using cross validation (KFold method)
Predictions = cross_val_predict(RFR, X, y, cv=10 )
#Decision Tree Regression Accuracy with cross validation
accuracy_dt = metrics.r2_score(y, Predictions)
print('Cross-Predicted(KFold) Decision Tree Regression Accuracy: ', accuracy_dt)

Cross-Predicted(KFold) Decision Tree Regression Accuracy:  0.6809382167263347
