# LSRL Code

In [1]:
import pandas as pd
import nltk
import numpy as np

In [2]:
import bs4
import string

In [3]:
from collections import Counter

In [7]:
from nltk.corpus import stopwords

In [8]:
stopword = stopwords.words('english')

from string import punctuation

In [9]:

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

## Useful Functions

In [10]:
#create data frame
df = pd.read_excel('LSRL_Master2.xlsx')

In [11]:
# Access one column
# take a look at the citation counts, for instance
df['Extra'] 

0        2.0
1        5.0
2       14.0
3       17.0
4       18.0
        ... 
1045     NaN
1046     NaN
1047     NaN
1048     NaN
1049     NaN
Name: Extra, Length: 1050, dtype: float64

In [12]:
#what is the type of each column
df.dtypes 

                     object
Item Type            object
Publication Year      int64
Author               object
Author 1             object
                     ...   
Section             float64
Session             float64
Committee           float64
History             float64
Legislative Body    float64
Length: 100, dtype: object

In [13]:
# overall size of the sheet
df.shape

(1050, 100)

In [14]:
# remove rows using .drop()
# df1.drop([246, 247, 250, 255], axis = 0, inplace = True) 

In [15]:
# gives first five rows
df.head()

Unnamed: 0,Unnamed: 1,Item Type,Publication Year,Author,Author 1,Author 2,Author 3,Author 4,Author 5,Author 6,...,Programming Language,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body
0,X2ZH3HVJ,bookSection,1972,"Rankin, Robert L.","Rankin, Robert L.",,,,,,...,,,,,,,,,,
1,63PPTCP7,bookSection,1972,"Saltarelli, Mario","Saltarelli, Mario",,,,,,...,,,,,,,,,,
2,8PLLMNAD,bookSection,1972,"Wanner, Dieter","Wanner, Dieter",,,,,,...,,,,,,,,,,
3,ABUSKF6A,bookSection,1972,"Rivero, Maria-Luisa","Rivero, María-Luisa",,,,,,...,,,,,,,,,,
4,H976X7J4,bookSection,1972,"Meyer, Paula L.","Meyer, Paula L.",,,,,,...,,,,,,,,,,


In [16]:
# all the details about this one row
df.loc[254]

                                 5P6HKL9W
Item Type                  journalArticle
Publication Year                     1984
Author              Callahan, Christopher
Author 1            Callahan, Christopher
                            ...          
Section                               NaN
Session                               NaN
Committee                             NaN
History                               NaN
Legislative Body                      NaN
Name: 254, Length: 100, dtype: object

## Clean up sheet

### 1. Removing null columns

In [17]:
# drop all columns (axis = 1) with all null values
# make it a copy to avoid SettingwithCopyWarning
df1 = df.dropna(axis = 1, how = 'all').copy() 

In [18]:
# new sheet has 43 columns
df1.shape

(1050, 43)

In [19]:
# remove rows with no publications
df1['key'].isnull().sum() # total 15 publications (2022)
df1.dropna(subset = ['key'], inplace = True) # remove them for now
df1.loc[len(df1)-1] # get the last item should be Gabriel Martínez

                                                                 DJMASDT6
Item Type                                                 conferencePaper
Publication Year                                                     2020
Author                                             Vera, Gabriel Martínez
Author 1                                           Vera, Gabriel Martínez
Author 2                                                              NaN
Author 3                                                              NaN
Author 4                                                              NaN
Author 5                                                              NaN
Author 6                                                              NaN
Title                   On recomplementation, high adverbs and V-movem...
key                     On recomplementation, high adverbs and V-movem...
Abstract                I address the structure of the Spanish left pe...
doi url                             ht

In [20]:
# new sheet has 1034 rows
df1.shape

(1035, 43)

### 2. Finding and removing duplicates

In [21]:
# Finding duplicates
df1[df1.duplicated(subset = ['key'])]

Unnamed: 0,Unnamed: 1,Item Type,Publication Year,Author,Author 1,Author 2,Author 3,Author 4,Author 5,Author 6,...,Library Catalog,Extra,Notes,Publication Type 2,File Attachments,Manual Tags,Editor,Meeting Name,Meeting Location,Conference Name
113,FKDHS8XD,bookSection,1978,"Reynolds, L.; Mez-Dombkowski, E.","Reynolds, L.","Mez-Dombkowski, E.",,,,,...,,0.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages
116,LVXHD7LB,bookSection,1978,"Saltarelli, M.","Saltarelli, M.",,,,,,...,,0.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages
117,WDK32N3G,bookSection,1978,"Walker, D.","Walker, D.",,,,,,...,,0.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages
118,2GV9EJBE,bookSection,1978,"Stead, D.","Stead, D.",,,,,,...,,0.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages
121,YHESVZTV,bookSection,1978,"Ford, A.","Ford, A.",,,,,,...,,0.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages
126,6IFXAMKH,bookSection,1978,"Luján, M.","Luján, M.",,,,,,...,,102.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages


In [22]:
# Remove duplicates
df1 = df1.drop_duplicates(subset = ['key'])
df1.shape

(1029, 43)

### 3. Fix page numbers

In [23]:
# Fix date issue 

# abbreviated months
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# fill null values with -1 
df2 = df1[['Begin Page', 'End Page']].fillna(value = '-1')

# if endpage has month then swap and put into new columns: 11-jan to jan-11 - so now only beginpage has month
df2['beginpage'] = np.where(df2['End Page'].isin(months), df2['End Page'], df2['Begin Page'])
df2['endpage'] = np.where(df2['End Page'].isin(months), df2['Begin Page'], df2['End Page'])

# if see month in startpage replace with numbers
for i,val in enumerate(months):
     df2['beginpage'] = df2['beginpage'].replace(val, str(i+1))

In [24]:
# Add truncated 1st number to end pages
df2['endpage'] = np.where(df2['beginpage'].astype('int') > df2['endpage'].astype('int'), 
                           df2['beginpage'].astype('string').str[:1] + df2['endpage'].astype('string') , df2['endpage'].astype('string'))

In [25]:
# replace Pages column
df2['Pages'] = df2['beginpage'].astype('string') + "-" + df2['endpage'].astype('string')

In [26]:
# fill in num pages column
df2['Num Pages'] = df2['endpage'].astype('int') - df2['beginpage'].astype('int')

In [27]:
# replace and add to df1
df1['Pages'] = df2['Pages']
df1['Begin Page'] = df2['beginpage']
df1['End Page'] = df2['endpage']
df1.insert(loc = (df1.columns.get_loc('End Page')+1), column = 'Num Pages', value = df2['Num Pages'])

### 4. Export to new excel!

In [28]:
# Export cleaned sheet to folder
# df1.to_excel('cleanedLSRL.xlsx')

## Fun Part! 

In [29]:
# created a new dataframe -> df3 for manipulation 
df3 = pd.read_excel('cleanedLSRL.xlsx')

### Titles
- avg length of titles
- longest title
- shortest title
- search articles that include a certain word
- counts of unique titles
- most common words used in titles

In [30]:
# take a look at the titles
df3['key'][250:260]

250                                Gemination In Italian
251    Phonological Implications Of The Perception Of...
252                           On Linear Order In Spanish
253                                       Controlled Pro
254    A Speech Event Analysis Of Tense And Aspect In...
255               Resumptive Pronoun Strategy In Spanish
256                Personal A, Kinesis And Individuation
257    An Autosegmental Approach To Syllabification I...
258    Theories Of Phonological Representation And Na...
259                                       Relativization
Name: key, dtype: object

In [31]:
# tokenize the titles and remove NaN
# add to a new dataframe
df3["tokenized_key"] = df3["key"].fillna("").map(lambda x: nltk.word_tokenize(x.lower())) 
print(df3['tokenized_key'][820:840])

820    [rhythmic, constraints, on, the, distribution,...
821    [subjects, and, wh-questions, :, some, new, ge...
822    [interpretive, deficit, ?, evidence, from, the...
823    [discontinuous, wh-constituents, in, brazilian...
824       [syntax, and, semantics, of, split, questions]
825    [parenthetical, null, topic, constructions, in...
826    [the, left, edge, in, the, spanish, clausal, s...
827    [rhotic, metathesis, asymmetries, in, romance,...
828      [the, domain, of, palatalization, in, romanian]
829    [putting, the, spanish, determiner, phrase, in...
830    [romanian, palatalization, :, the, role, of, p...
831    [proscriptions, …, gaps, …, and, something, in...
832    [discriminating, pitch, accent, alignment, in,...
833    [romance, paths, as, cognate, complements, :, ...
834    [antisymmetry, and, the, typology, of, relativ...
835    [crypto-variation, in, italian, velar, palatal...
836            [developing, i-language, in, l1, and, l2]
837    [licensing, negative, fr

In [42]:
# TO-DO: remove stop words and punctuations
punctuation = list(punctuation)

df3["nopunc_key"] = df3["tokenized_key"].map(lambda x: (t for t in x.split() if t not in stopwords))
print(df3['nopunc_key'][820:840])

AttributeError: 'list' object has no attribute 'split'

In [None]:
# average length of LSRL title
average_str_length = len(df3['tokenized_key'].sum())/len(df3)
print("The average length of an LSRL title is: ", average_str_length)

In [None]:
# longest title
longest_string = max(df3['tokenized_key'], key=len)
print("The longest title is :", longest_string) 

In [None]:
# shortest title
shortest_string = min(df3['tokenized_key'], key=len)
print("The shortest title is :", shortest_string) 

In [None]:
#find a row that contains relativization in 'key'
df3[df3['key'].str.contains('relativization')]

In [None]:
# counts unique titles
count = df3['tokenized_key'].value_counts()
print(count)

In [None]:
# return counts of words used in titles of the entire database 
s = pd.Series(Counter([y for x in df3['tokenized_key'] for y in x]))

# sort to find the most commonly used words in titles
y = s.sort_values(ascending = False)
pd.set_option('display.max_rows', None)
print(y)

### Authors
- most prolific writer
- avg number of writers per paper
- trends over time

In [None]:
## TO-DO: standardize the authors to make sure the order is reversed
## TO-DO: remove stop words and punctuations
# tokenize authors and remove null
df3["tokenized_author"] = df3["Author"].fillna("").map(lambda x: nltk.word_tokenize(x.lower())) 
df3["tokenized_author"]

In [None]:
# return counts of authors and sorts it
w = pd.Series(Counter([y for x in df3['tokenized_author'] for y in x]))
auteur = w.sort_values(ascending = False)
pd.set_option('display.max_rows', None)
print(auteur)

### Tags
- most common language
- area of linguistics most studied
- trends of topics over time

### Abstracts

In [None]:
df3[120:150]