In [356]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score


In [357]:
train_data = pd.read_excel('challenge_trainning_2021.xlsx', names=['subject_value_lable', 'text'])


In [358]:
train_data.head()

Unnamed: 0,subject_value_lable,text
0,Social Sciences,\nIn the current debate on inequality in Afric...
1,Social Sciences,\nIn this paper I propose that happiness can b...
2,Medicine & Public Health,\nChronic migraine is a debilitating primary h...
3,Medicine & Public Health,\nLiver and gastrointestinal disorders during ...
4,Medicine & Public Health,"\nTraditionally, weight-loss lifestyle-modific..."


In [359]:
train_data.subject_value_lable.unique()

array(['Social Sciences', 'Medicine & Public Health', 'Psychology',
       'Biomedicine'], dtype=object)

In [360]:
train_data.shape

(8000, 2)

In [361]:
train_data.drop_duplicates(inplace=True)

In [362]:
train_data.shape

(7999, 2)

## Preprocessing the text


In [364]:
train_data['text'] = train_data['text'].str.strip()


In [365]:
STOPWORDS = set(stopwords.words('english'))

In [368]:
def clean_text(text):
    #text = text.lower()
    text = re.sub(re.compile(r'\s+([\.\,\;\:])', re.DOTALL), r'\1', text)
    text = re.sub(re.compile(r' ([\t]+)', re.DOTALL), r' ', text)
    text = re.sub(re.compile(r'([a-zA-Z])\s+([a-zA-Z]+)', re.DOTALL), r'\1 \2', text)
    text = re.sub(re.compile(r'\n\s+([a-zA-Z]+)', re.DOTALL), r' \1', text)
    #text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [369]:
train_data['text'] = train_data['text'].apply(clean_text)

In [370]:
writer = pd.ExcelWriter('clean_data.xlsx')
train_data.to_excel(writer)
writer.save()

In [371]:
train_data.head()

Unnamed: 0,subject_value_lable,text
0,Social Sciences,In the current debate on inequality in Africa ...
1,Social Sciences,In this paper I propose that happiness can be ...
2,Medicine & Public Health,Chronic migraine is a debilitating primary hea...
3,Medicine & Public Health,Liver and gastrointestinal disorders during pr...
4,Medicine & Public Health,"Traditionally, weight-loss lifestyle-modificat..."


In [372]:
print(train_data['text'].apply(lambda x: len(x.split(' '))).sum())

7899051


## Train Test split


In [373]:
x = train_data.text
y = train_data.subject_value_lable
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 111)

In [374]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((6399,), (1600,), (6399,), (1600,))

## Applying Logistic Regression


In [375]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

lr = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', LogisticRegression()),
              ])

lr.fit(X_train,y_train)
y_pred1 = lr.predict(X_test)

print(f"Accuracy is : {accuracy_score(y_pred1,y_test)}")

Accuracy is : 0.89


## Save the model

In [376]:
import pickle

model_filename = "subject_classifier.mdl"

saved_model = pickle.dump(lr, open(model_filename,'wb'))

print('Model is saved into to disk successfully Using Pickle')



Model is saved into to disk successfully Using Pickle


## load the model and run on test-data

In [377]:
model_filename = "subject_classifier.mdl"
my_lr_model = pickle.load(open(model_filename, 'rb'))
result = my_lr_model.predict(X_test)
result

array(['Social Sciences', 'Social Sciences', 'Medicine & Public Health',
       ..., 'Psychology', 'Social Sciences', 'Biomedicine'], dtype=object)

## run on given test sample data

In [378]:
test_data = pd.read_excel('test.xlsx', index_col=0, header=0)
test_data['text'] = test_data['text'].str.strip()
test_data.head()

Unnamed: 0_level_0,text
ID,Unnamed: 1_level_1
0,This chapter contains a scenario in which a fe...
1,Vaccines currently routinely recommended for p...
2,With the development of a 5-year integrated va...
3,Clients with eating disorders (ED) make up a g...
4,Several different viruses and some bacterial p...


In [379]:
result = my_lr_model.predict(test_data['text'])

In [380]:
predicted_label = pd.DataFrame(result, columns=['subject_value'])

In [272]:
predicted_label

Unnamed: 0,subject_value
0,Medicine & Public Health
1,Medicine & Public Health
2,Medicine & Public Health
3,Psychology
4,Medicine & Public Health
5,Biomedicine
6,Biomedicine
7,Psychology
8,Psychology
9,Social Sciences


## save result in csv format

In [381]:
predicted_label.to_csv("test_result.csv")

Unnamed: 0,subject_value,text
0,,This chapter contains a scenario in which a fe...
1,,Vaccines currently routinely recommended for p...
2,,With the development of a 5-year integrated va...
3,,Clients with eating disorders (ED) make up a g...
4,,Several different viruses and some bacterial p...
5,,“Toward a Synthetic Genome” Section\n ...
6,,"Capturing videos anytime and anywhere, and the..."
7,,Emotions are debated daily in scientific and s...
8,,Dementia is characterised by cognitive impairm...
9,,This conclusion summarizes and assesses change...


## save result in excel format

In [382]:
result = pd.concat([predicted_label, test_data], axis=1, join='inner', names = ['ID', 'subject_value', 'text'])
#display(result)

writer = pd.ExcelWriter('test_result.xlsx')
result.to_excel(writer)
writer.save()
print("DataFrame is exported successfully to 'converted-to-excel.xlsx' Excel File.")
result.to_csv("test_result.csv")

DataFrame is exported successfully to 'converted-to-excel.xlsx' Excel File.


## manual test

In [254]:
text = '''PCIT requires adept, in-the-moment application of theory and techniques during situations that may be stressful for both the caregiver and therapist (e.g., a child tantrum). This is compounded by the fact that many therapists will have no previous experience with in-vivo coaching, one of the cornerstone attributes of PCIT. For therapists to develop this skillset, they will require both extensive training and sufficient supervision to support the application of PCIT with initial clients. These tasks are paramount to the goal of increasing the reach of PCIT to those who need it. The purpose of this chapter is, therefore, to summarize the current state of the research on both EBT training and supervision in general and PCIT training specifically and discuss the implications of such research on the process of PCIT dissemination.
Unfortunately, very few studies have specifically focused on PCIT therapist training outcomes. However, the research on EBT trainings in general can still be informative, and are included here. Overall, the main goals of PCIT training and supervision include not only teaching skills necessary to conduct PCIT but also ensuring trainees can overcome barriers to implementing the treatment with fidelity with appropriate families. A strong training program is essential to meeting these goals, as research suggests that, although studying the PCIT manual itself is helpful, it is not enough for trainees to develop adequate PCIT competency (Herschell et al., 2009). Furthermore, Beveridge et al. (2015) have stressed that, beyond covering the specific components of PCIT, training also needs to address therapist and agency barriers (see below) to successful PCIT utilization, while Christian, Niec, Acevedo-Polakovich, and Kassab (2014) wrote that “the lack of effective communication [with agencies], agency readiness, as well as clinician factors, create[s] or maintain[s] barriers to completing [PCIT] training” (p. 15).
Although the research literature has not adequately delineated what trainee attributes (e.g., education level, clinical experience, caseload) predict more successful training outcomes—and in many cases studies do not fully describe these characteristics in the training sample (Beveridge et al., 2015)—some trainee barriers are notable. While a recent survey found generally favorable views towards manualized treatments in child advocacy center workers (Staudt & Williams-Hayes, 2011), not all therapists (and thus not all trainees) will enter trainings with high levels of confidence in manualized, evidence-based treatments such as PCIT. Clinicians with more years of experience tend to have more negative attitudes towards manualized treatments (Barry et al., 2008; Becker, Smith, & Jensen-Doss, 2013), which may reflect shifting attitudes towards EBTs in treatment programs over time. Shafran et al. (2009) note that clinician attitudes which may be barriers to the dissemination of evidence-based practice include the belief that research studies do not sufficiently relate to the characteristics of actual clinical practice, that therapist attributes are more important to treatment outcomes than specific treatments, or that choosing specific components of treatments to match client needs is more valuable than following specific protocols. Clinicians may also incorrectly believe that comorbidity reduces the effectiveness of evidence-based protocols (Shafran et al., 2009). For PCIT specifically, trainees have described certain components of the PCIT protocol (e.g., the mastery criteria) as barriers to implement PCIT in their practice; in this same study, clinicians who dropped out of training or failed to meet mastery criteria were less likely to report positive views of core PCIT components such as coaching, mastery criteria, CDI and PDI teaching sessions, and co-therapy (Christian et al., 2014).'''
my_lr_model.predict([text])[0]




'Psychology'

In [383]:
text = '''In the field of nanoscience, a series of surface chemical modifications or coatings have been designed to acquire the new physicochemical properties or to enhance the certain aspects of physicochemical properties. Meanwhile, some of the surface chemical modifications or coatings also have the beneficial effects in reducing the potential toxicity of nanomaterials. In this chapter, we focus on three important surface chemical modifications, polyethylene glycol (PEG) modification, ZnS or fetal bovine serum (FBS) surface coating, and carboxyl (–COOH) modification, to introduce the beneficial effects of these surface chemical modifications in reducing the toxicity of nanomaterials in nematodes.
In the field of nanoscience, the main effort to reduce the toxicity of engineered nanomaterials (ENMs) is to design the suitable modification on the surface of examined ENMs [1–4]. Actually, reduction of the potential toxicity of certain ENMs is normally not the only research aim for surface chemical modifications. The aims to design certain surface chemical modifications usually contain both the toxicity reduction and the acquirement of new physicochemical properties or the enhancement of certain aspects of physicochemical properties [5–14]. One of the chemical modifications is to design chemical binding of certain group(s) [15, 16]. Another important chemical modification is to design certain form of coating to cover the examined ENMs [17, 18]. So far, besides the acquirement of new physicochemical properties or the enhancement of certain aspects of physicochemical properties, the increasing evidence on nanotoxicology has prompted the attention for chemists or material scientists the necessity to consider thoroughly the safety of ENMs potentially used in different industrial and medical applications.
In this chapter, we mainly focus on three important chemical modifications, polyethylene glycol (PEG) modification, ZnS or fetal bovine serum (FBS) surface coating, and carboxyl (–COOH) modification, to introduce the different aspects of certain surface chemical modifications to reduce the potential toxicity of ENMs in nematodes.
PEG surface modification is a frequently employed strategy to acquire the new physicochemical properties and to reduce the toxicity of ENMs [19–23]. More importantly, it has been indicated that the surface PEGylation could provide a “stealth” characteristic for ENMs; otherwise it would be identified as foreign materials by the body of organisms [23, 24].
Along with the worldwide GO production, it is possible that a large amount of GO could be released into the environment, and GO would pose a significant environmental and health risk. PEG surface modification is a normally used chemical strategy to reduce the GO toxicity [25, 26].
Chronic exposure to GO-PEG (1 mg/L) from L1-larvae to adult day-8 did not cause the toxicity on locomotion behavior as reflected by the endpoints of head thrash and body bend in nematodes [27] (Fig. 10.1). Similarly, chronic exposure to GO-PEG (1 mg/L) also did not result in the noticeable inductions of either intestinal autofluorescence or intestinal reactive oxygen species (ROS) production in nematodes [27] (Fig. 10.1). In contrast, chronic exposure to GO (1 mg/L) led to the significant decrease in locomotion behavior and the significant induction of both intestinal autofluorescence and intestinal ROS production in nematodes [27] (Fig. 10.1). These results demonstrate that PEG surface modification can be helpful for maintaining the normal functional state of both primary and secondary targeted organs in GO-exposed nematodes.
                  
In nematodes, chronic exposure to GO from L1-larvae to adult day-8 caused the severe GO deposition in the body, as well as the severe OP50 accumulation in the intestine, which is considered to be closely associated with the possible decrease in innate immune response of animals [27]. In contrast, PEG surface modification noticeably suppressed the GO deposition in the body, including the intestine, in nematodes [27]. More importantly, PEG surface modification obviously inhibited the OP50 accumulation in the intestine in nematodes [27]. Meanwhile, PEG surface modification significantly recovered the expression patterns of dysregulated genes encoding antimicrobial peptides or p38 MAPK signaling pathway induced by GO chronic exposure [27]. That is, PEG surface modification may have the potential to prevent the OP50 accumulation and to maintain the normal intestinal innate immune response in GO-exposed nematodes.
In nematodes, AVL neurons in the head and DVB neurons in the tail are required for the control of defecation behavior [28]. GO exposure could cause the significant increase in mean defecation cycle length in nematodes [27, 29]. Additionally, GO exposure resulted in the developmental deficit in both the AVL neurons and the DVB neurons as indicated by the decrease in fluorescent size of AVL or DVB neurons in GO-exposed nematodes [27]. In contrast, PEG surface modification could be helpful for nematodes to maintain the normal developmental state of both the AVL neurons and the DVB neurons [27]. Moreover, the normal mean defecation cycle length was observed in PEG-GO-exposed nematodes after chronic exposure from L1-larvae to adult day-8 [27].
A cellular mechanism has been raised to explain these observations. In GO-exposed nematodes, the obvious colocalization of Rho B-labeled GO with AVL or DVB neurons has been detected [27]. Different from this, PEG surface modification inhibited the colocalization of Rho B-labeled GO with AVL or DVB neurons [27].
An important molecular mechanism raised to explain the beneficial effect of PEG surface modification to reduce the GO toxicity is that PEG modification may reduce the GO toxicity by influencing the functions of specific lncRNAs in nematodes [30]. After the comparison of 34 candidate long noncoding RNAs (lncRNAs) in control, GO- or GO-PEG-exposed nematodes, expression patterns of some dysregulated lncRNAs induced by GO exposure could be reversed by PEG surface modification [30]. PEG surface modification could increase the expressions of linc-37, linc-5, linc- 24, linc-14, XLOC_013642, XLOC_010849, and XLOC_004416 and decrease the expression of XLOC_007959 in GO-exposed nematodes [30].
Another corresponding molecular mechanism has also been raised to explain the observations introduced above. In nematodes, GO exposure dysregulated the expression levels of some genes required for the control of defecation behavior [27, 29]. In contrast, PEG surface modification could be helpful for nematodes to maintain the normal expression patterns of genes required for the control of defecation behavior including those dysregulated by GO exposure [27]."
'''
my_lr_model.predict([text])[0]

'Biomedicine'

## Applying Naive Bayes Classifier

In [384]:
from sklearn.naive_bayes import MultinomialNB


naivebayes = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
naivebayes.fit(X_train, y_train)

y_pred = naivebayes.predict(X_test)

print(f'accuracy {accuracy_score(y_pred,y_test)}')

accuracy 0.860625
