In [1]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Importing data
import pandas as pd 
# Local directory
Reviewdata = pd.read_csv('reviews2csv.csv')
Reviewdata=Reviewdata.astype(str)

In [4]:
#Removing columns
Reviewdata.drop(columns = ['paper', 'review','timespan','remarks','lan','assessment','orientation'], inplace = True)

In [5]:
Reviewdata.shape

(408, 5)

In [6]:
Reviewdata.head()

Unnamed: 0,id,preliminary_decision,confidence,id.1,text
0,1.0,accept,4.0,1.0,- The article deals with a contingent and very...
1,,accept,4.0,2.0,The article presents practical recommendations...
2,,accept,5.0,3.0,The topic is very interesting and a guide to i...
3,2.0,accept,4.0,1.0,An experience of using ICT for academic collab...
4,,accept,4.0,2.0,


In [7]:
### Checking Missing values 

count = Reviewdata.isnull().sum().sort_values(ascending=False)
percentage = ((Reviewdata.isnull().sum()/len(Reviewdata)*100)).sort_values(ascending=False)
missing_data = pd.concat([count, percentage], axis=1,
keys=['Count','Percentage'])

print('Count and percentage of missing values for the columns:')

missing_data

Count and percentage of missing values for the columns:


Unnamed: 0,Count,Percentage
text,0,0.0
id.1,0,0.0
confidence,0,0.0
preliminary_decision,0,0.0
id,0,0.0


In [8]:
### Checking for the percentage Distribution  ###
import matplotlib.pyplot as plt
%matplotlib inline
print('Percentage for default\n')
print(round(Reviewdata.preliminary_decision.value_counts(normalize=True)*100,4))
round(Reviewdata.preliminary_decision.value_counts(normalize=True)*100,4).plot(kind='bar')
plt.title('Percentage Distributions by review type')
plt.show()

Percentage for default

accept             64.7059
reject             29.9020
probably reject     4.9020
no decision         0.4902
Name: preliminary_decision, dtype: float64


TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type

In [10]:
# Apply first level cleaning
import re
import string

#This function converts to lower-case,removes numbers and punctuation, removes square bracket 
def text_clean_1(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

cleaned1 = lambda x: text_clean_1(x)

In [11]:
Reviewdata=Reviewdata.dropna()

In [12]:
Reviewdata['cleaned_review'] = pd.DataFrame(Reviewdata.text.apply(cleaned1))
Reviewdata.head(10)

Unnamed: 0,id,preliminary_decision,confidence,id.1,text,cleaned_review
0,1.0,accept,4.0,1.0,- The article deals with a contingent and very...,the article deals with a contingent and very ...
1,,accept,4.0,2.0,The article presents practical recommendations...,the article presents practical recommendations...
2,,accept,5.0,3.0,The topic is very interesting and a guide to i...,the topic is very interesting and a guide to i...
3,2.0,accept,4.0,1.0,An experience of using ICT for academic collab...,an experience of using ict for academic collab...
4,,accept,4.0,2.0,,
5,,accept,4.0,3.0,The authors describe a methodology for collabo...,the authors describe a methodology for collabo...
6,3.0,accept,4.0,1.0,This work proposes a new approach based on [25...,this work proposes a new approach based on to...
7,,accept,3.0,2.0,This paper aims to show new deployment alterna...,this paper aims to show new deployment alterna...
8,,accept,3.0,3.0,The paper is well structured. It follows a log...,the paper is well structured it follows a logi...
9,4.0,accept,4.0,1.0,Se realiza un trabajo de modelamiento de encri...,se realiza un trabajo de modelamiento de encri...


In [13]:
# Apply a second round of cleaning
def text_clean_2(text):
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

cleaned2 = lambda x: text_clean_2(x)

In [14]:
Reviewdata['cleaned_review_new'] = pd.DataFrame(Reviewdata['cleaned_review'].apply(cleaned2))
Reviewdata.head(10)

Unnamed: 0,id,preliminary_decision,confidence,id.1,text,cleaned_review,cleaned_review_new
0,1.0,accept,4.0,1.0,- The article deals with a contingent and very...,the article deals with a contingent and very ...,the article deals with a contingent and very ...
1,,accept,4.0,2.0,The article presents practical recommendations...,the article presents practical recommendations...,the article presents practical recommendations...
2,,accept,5.0,3.0,The topic is very interesting and a guide to i...,the topic is very interesting and a guide to i...,the topic is very interesting and a guide to i...
3,2.0,accept,4.0,1.0,An experience of using ICT for academic collab...,an experience of using ict for academic collab...,an experience of using ict for academic collab...
4,,accept,4.0,2.0,,,
5,,accept,4.0,3.0,The authors describe a methodology for collabo...,the authors describe a methodology for collabo...,the authors describe a methodology for collabo...
6,3.0,accept,4.0,1.0,This work proposes a new approach based on [25...,this work proposes a new approach based on to...,this work proposes a new approach based on to...
7,,accept,3.0,2.0,This paper aims to show new deployment alterna...,this paper aims to show new deployment alterna...,this paper aims to show new deployment alterna...
8,,accept,3.0,3.0,The paper is well structured. It follows a log...,the paper is well structured it follows a logi...,the paper is well structured it follows a logi...
9,4.0,accept,4.0,1.0,Se realiza un trabajo de modelamiento de encri...,se realiza un trabajo de modelamiento de encri...,se realiza un trabajo de modelamiento de encri...


In [15]:
from sklearn.model_selection import train_test_split

x = Reviewdata.cleaned_review_new
y= Reviewdata.preliminary_decision



x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 800)

print('X_train :', len(x_train))
print('X_test  :', len(x_test))
print('Y_train :', len(y_train))
print('Y_test  :', len(y_test))

X_train : 326
X_test  : 82
Y_train : 326
Y_test  : 82


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

tvec = TfidfVectorizer()
clf2 = LogisticRegression(solver = "lbfgs")


from sklearn.pipeline import Pipeline

In [18]:
model = Pipeline([('vectorizer',tvec),('classifier',clf2)])

model.fit(x_train, y_train)


from sklearn.metrics import confusion_matrix

predictions = model.predict(x_test)

confusion_matrix(predictions, y_test)

array([[58,  2, 19],
       [ 0,  0,  0],
       [ 0,  0,  3]], dtype=int64)

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(predictions, y_test))
print("Precision : ", precision_score(predictions, y_test, average = 'weighted'))
print("Recall : ", recall_score(predictions, y_test, average = 'weighted'))

Accuracy :  0.7439024390243902
Precision :  0.9684035476718403
Recall :  0.7439024390243902


In [20]:
input =["innovative proposal for the application"]
result = model.predict(input)

print(result)

['accept']
