# LSRL Code

In [1]:
import pandas as pd
import nltk
import numpy as np

In [2]:
import bs4
import string

In [3]:
from collections import Counter

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

## Useful Functions

In [5]:
#create data frame
df = pd.read_excel('LSRL_Master2.xlsx')

In [6]:
# Access one column
# take a look at the citation counts, for instance
df['Extra'] 

0        2.0
1        5.0
2       14.0
3       17.0
4       18.0
        ... 
1045     NaN
1046     NaN
1047     NaN
1048     NaN
1049     NaN
Name: Extra, Length: 1050, dtype: float64

In [7]:
#what is the type of each column
df.dtypes 

                     object
Item Type            object
Publication Year      int64
Author               object
Author 1             object
                     ...   
Section             float64
Session             float64
Committee           float64
History             float64
Legislative Body    float64
Length: 100, dtype: object

In [8]:
# overall size of the sheet
df.shape

(1050, 100)

In [9]:
# remove rows using .drop()
# df1.drop([246, 247, 250, 255], axis = 0, inplace = True) 

In [10]:
# gives first five rows
df.head()

Unnamed: 0,Unnamed: 1,Item Type,Publication Year,Author,Author 1,Author 2,Author 3,Author 4,Author 5,Author 6,...,Programming Language,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body
0,X2ZH3HVJ,bookSection,1972,"Rankin, Robert L.","Rankin, Robert L.",,,,,,...,,,,,,,,,,
1,63PPTCP7,bookSection,1972,"Saltarelli, Mario","Saltarelli, Mario",,,,,,...,,,,,,,,,,
2,8PLLMNAD,bookSection,1972,"Wanner, Dieter","Wanner, Dieter",,,,,,...,,,,,,,,,,
3,ABUSKF6A,bookSection,1972,"Rivero, Maria-Luisa","Rivero, María-Luisa",,,,,,...,,,,,,,,,,
4,H976X7J4,bookSection,1972,"Meyer, Paula L.","Meyer, Paula L.",,,,,,...,,,,,,,,,,


In [11]:
# all the details about this one row
df.loc[254]

                                 5P6HKL9W
Item Type                  journalArticle
Publication Year                     1984
Author              Callahan, Christopher
Author 1            Callahan, Christopher
                            ...          
Section                               NaN
Session                               NaN
Committee                             NaN
History                               NaN
Legislative Body                      NaN
Name: 254, Length: 100, dtype: object

## Clean up sheet

### 1. Removing null columns

In [12]:
# drop all columns (axis = 1) with all null values
# make it a copy to avoid SettingwithCopyWarning
df1 = df.dropna(axis = 1, how = 'all').copy() 

In [13]:
# new sheet has 43 columns
df1.shape

(1050, 43)

In [14]:
# remove rows with no publications
df1['key'].isnull().sum() # total 15 publications (2022)
df1.dropna(subset = ['key'], inplace = True) # remove them for now
df1.loc[len(df1)-1] # get the last item should be Gabriel Martínez

                                                                 DJMASDT6
Item Type                                                 conferencePaper
Publication Year                                                     2020
Author                                             Vera, Gabriel Martínez
Author 1                                           Vera, Gabriel Martínez
Author 2                                                              NaN
Author 3                                                              NaN
Author 4                                                              NaN
Author 5                                                              NaN
Author 6                                                              NaN
Title                   On recomplementation, high adverbs and V-movem...
key                     On recomplementation, high adverbs and V-movem...
Abstract                I address the structure of the Spanish left pe...
doi url                             ht

In [15]:
# new sheet has 1034 rows
df1.shape

(1035, 43)

### 2. Finding and removing duplicates

In [16]:
# Finding duplicates
df1[df1.duplicated(subset = ['key'])]

Unnamed: 0,Unnamed: 1,Item Type,Publication Year,Author,Author 1,Author 2,Author 3,Author 4,Author 5,Author 6,...,Library Catalog,Extra,Notes,Publication Type 2,File Attachments,Manual Tags,Editor,Meeting Name,Meeting Location,Conference Name
113,FKDHS8XD,bookSection,1978,"Reynolds, L.; Mez-Dombkowski, E.","Reynolds, L.","Mez-Dombkowski, E.",,,,,...,,0.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages
116,LVXHD7LB,bookSection,1978,"Saltarelli, M.","Saltarelli, M.",,,,,,...,,0.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages
117,WDK32N3G,bookSection,1978,"Walker, D.","Walker, D.",,,,,,...,,0.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages
118,2GV9EJBE,bookSection,1978,"Stead, D.","Stead, D.",,,,,,...,,0.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages
121,YHESVZTV,bookSection,1978,"Ford, A.","Ford, A.",,,,,,...,,0.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages
126,6IFXAMKH,bookSection,1978,"Luján, M.","Luján, M.",,,,,,...,,102.0,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages


In [17]:
# Remove duplicates
df1 = df1.drop_duplicates(subset = ['key'])
df1.shape

(1029, 43)

### 3. Fix page numbers

In [18]:
# Fix date issue 

# abbreviated months
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# fill null values with -1 
df2 = df1[['Begin Page', 'End Page']].fillna(value = '-1')

# if endpage has month then swap and put into new columns: 11-jan to jan-11 - so now only beginpage has month
df2['beginpage'] = np.where(df2['End Page'].isin(months), df2['End Page'], df2['Begin Page'])
df2['endpage'] = np.where(df2['End Page'].isin(months), df2['Begin Page'], df2['End Page'])

# if see month in startpage replace with numbers
for i,val in enumerate(months):
     df2['beginpage'] = df2['beginpage'].replace(val, str(i+1))

In [19]:
# Add truncated 1st number to end pages
df2['endpage'] = np.where(df2['beginpage'].astype('int') > df2['endpage'].astype('int'), 
                           df2['beginpage'].astype('string').str[:1] + df2['endpage'].astype('string') , df2['endpage'].astype('string'))

In [20]:
# replace Pages column
df2['Pages'] = df2['beginpage'].astype('string') + "-" + df2['endpage'].astype('string')

In [21]:
# fill in num pages column
df2['Num Pages'] = df2['endpage'].astype('int') - df2['beginpage'].astype('int')

In [22]:
# replace and add to df1
df1['Pages'] = df2['Pages']
df1['Begin Page'] = df2['beginpage']
df1['End Page'] = df2['endpage']
df1.insert(loc = (df1.columns.get_loc('End Page')+1), column = 'Num Pages', value = df2['Num Pages'])

### 4. Export to new excel!

In [23]:
# Export cleaned sheet to folder
# df1.to_excel('cleanedLSRL.xlsx')

## Fun Part! 

In [37]:
# created a new dataframe -> df3 for manipulation 
df3 = pd.read_excel('cleanedLSRL.xlsx')

### Titles
- avg length of titles
- longest title
- shortest title
- search articles that include a certain word
- counts of unique titles
- most common words used in titles

In [25]:
# take a look at the titles
df3['key'][250:260]

256                                Gemination In Italian
257    Phonological Implications Of The Perception Of...
258                           On Linear Order In Spanish
259                                       Controlled Pro
260    A Speech Event Analysis Of Tense And Aspect In...
261               Resumptive Pronoun Strategy In Spanish
262                Personal A, Kinesis And Individuation
263    An Autosegmental Approach To Syllabification I...
264    Theories Of Phonological Representation And Na...
265                                       Relativization
Name: key, dtype: object

In [26]:
# tokenize the titles and remove NaN
# add to a new dataframe
df3["tokenized_key"] = df3["key"].fillna("").map(lambda x: nltk.word_tokenize(x.lower())) 
print(df3['tokenized_key'][820:840])

826    [rhythmic, constraints, on, the, distribution,...
827    [subjects, and, wh-questions, :, some, new, ge...
828    [interpretive, deficit, ?, evidence, from, the...
829    [discontinuous, wh-constituents, in, brazilian...
830       [syntax, and, semantics, of, split, questions]
831    [parenthetical, null, topic, constructions, in...
832    [the, left, edge, in, the, spanish, clausal, s...
833    [rhotic, metathesis, asymmetries, in, romance,...
834      [the, domain, of, palatalization, in, romanian]
835    [putting, the, spanish, determiner, phrase, in...
836    [romanian, palatalization, :, the, role, of, p...
837    [proscriptions, …, gaps, …, and, something, in...
838    [discriminating, pitch, accent, alignment, in,...
839    [romance, paths, as, cognate, complements, :, ...
840    [antisymmetry, and, the, typology, of, relativ...
841    [crypto-variation, in, italian, velar, palatal...
842            [developing, i-language, in, l1, and, l2]
843    [licensing, negative, fr

In [27]:
# TO-DO: remove stop words and punctuations


In [28]:
# average length of LSRL title
average_str_length = len(df3['tokenized_key'].sum())/len(df3)
print("The average length of an LSRL title is: ", average_str_length)

The average length of an LSRL title is:  8.718172983479105


In [29]:
# longest title
longest_string = max(df3['tokenized_key'], key=len)
print("The longest title is :", longest_string) 

The longest title is : ['the', 'mute', 'e', 'in', 'french', 'as', 'a', 'sociolinguistic', 'variable', ',', 'etudes', 'linguistiques', 'sur', 'les', 'langues', 'romanes', ',', '1978', ',', 'montreal', ',', 'p.', 'ii-20', '.', '6']


In [30]:
# shortest title
shortest_string = min(df3['tokenized_key'], key=len)
print("The shortest title is :", shortest_string) 

The shortest title is : ['relativization']


In [31]:
#find a row that contains relativization in 'key'
df3[df3['key'].str.contains('relativization')]

Unnamed: 0,Unnamed: 1,Item Type,Publication Year,Author,Author 1,Author 2,Author 3,Author 4,Author 5,Author 6,...,Extra,Notes,Publication Type 2,File Attachments,Manual Tags,Editor,Meeting Name,Meeting Location,Conference Name,tokenized_key
845,DA2RVSDK,journalArticle,2009,"Gutiérrez-Rexach, Javier","Gutiérrez-Rexach, Javier",,,,,,...,,<p>Accession Number: 2009932237. Series ISSN: ...,book article,,,"Masullo, Pascual José; O'Rourke, Erin; Huang, ...",LSRL37,Pittsburgh,Linguistic Symposium on Romance Languages,"[correlativization, and, degree, quantificatio..."


In [32]:
# counts unique titles
count = df3['tokenized_key'].value_counts()
print(count)

[a, minor, rule, with, historical, implications, in, rumanian]                                             1
[a, constraint-based, analysis, of, intraspeaker, variation, :, vocalic, epenthesis, in, vimeu, picard]    1
[a, comparative, semantics, for, the, subjunctive, mood, in, spanish]                                      1
[syntactic, constraints, on, access, to, lexical, structure, :, the, case, of, ficar]                      1
[the, role, of, syntactic, modifications, on, l2, oral, comprehension]                                     1
                                                                                                          ..
[the, middle, and, the, pseudo-middle, in, french]                                                         1
[the, case, for, a, syntax-dependent, postlexical, module, in, spanish, phonology]                         1
[consonant, strength, in, the, romance, dialects, of, the, pyrenees]                                       1
[problems, with, pa

In [33]:
# return counts of words used in titles of the entire database 
s = pd.Series(Counter([y for x in df3['tokenized_key'] for y in x]))

# sort to find the most commonly used words in titles
y = s.sort_values(ascending = False)
pd.set_option('display.max_rows', None)
print(y)

in                             658
and                            481
the                            458
of                             448
spanish                        280
:                              255
french                         219
a                              156
on                             145
,                              111
romance                        107
italian                         59
portuguese                      57
case                            53
from                            52
?                               43
to                              43
an                              42
old                             40
romanian                        38
'                               38
syntax                          35
evidence                        34
for                             33
clitic                          33
structure                       33
analysis                        32
subject                         32
agreement           

### Authors
- most prolific writer
- avg number of writers per paper
- trends over time

In [34]:
## TO-DO: standardize the authors to make sure the order is reversed
## TO-DO: remove stop words and punctuations
# tokenize authors and remove null
df3["tokenized_author"] = df3["Author"].fillna("").map(lambda x: nltk.word_tokenize(x.lower())) 
df3["tokenized_author"]

0                               [rankin, ,, robert, l, .]
1                                  [saltarelli, ,, mario]
2                                     [wanner, ,, dieter]
3                                [rivero, ,, maria-luisa]
4                                 [meyer, ,, paula, l, .]
5                                       [lujan, ,, marta]
6                              [gulstad, ,, daniel, e, .]
7                                 [goldin, ,, mark, g, .]
8                              [dinnsen, ,, daniel, a, .]
9                                       [foley, ,, james]
10                               [harris, ,, james, w, .]
11                            [cressey, ,, william, w, .]
12                               [hensey, ,, fritz, g, .]
13                           [patterson, ,, george, w, .]
14                                        [paff, ,, toby]
15                           [langacker, ,, ronald, w, .]
16                              [kayne, ,, richard, s, .]
17            

In [35]:
# return counts of authors and sorts it
w = pd.Series(Counter([y for x in df3['tokenized_author'] for y in x]))
auteur = w.sort_values(ascending = False)
pd.set_option('display.max_rows', None)
print(auteur)

,                      1270
;                       240
.                       230
a                        42
m                        36
e                        29
j                        29
john                     26
l                        21
james                    18
mario                    18
saltarelli               17
w                        17
maria                    16
c                        14
jean-pierre              14
herschensohn             14
d                        13
montreuil                13
julia                    13
michael                  13
robert                   13
mary                     13
tranel                   13
harris                   12
contreras                11
m.                       11
suñer                    11
arteaga                  10
carmen                   10
j.                       10
rivero                   10
josé                     10
heles                    10
bernard                  10
margarita           

### Tags
- most common language
- area of linguistics most studied
- trends of topics over time

### Abstracts

In [36]:
df3[120:150]

Unnamed: 0,Unnamed: 1,Item Type,Publication Year,Author,Author 1,Author 2,Author 3,Author 4,Author 5,Author 6,...,Notes,Publication Type 2,File Attachments,Manual Tags,Editor,Meeting Name,Meeting Location,Conference Name,tokenized_key,tokenized_author
125,ACJYGQ34,bookSection,1978,"DILLER, AM","DILLER, AM",,,,,,...,,book article,,,"Morin, YC",LSRL06,Université de Québec à Montréal,Linguistic Symposium on Romance Languages,"[the, mute, e, in, french, as, a, sociolinguis...","[diller, ,, am]"
127,JMG8HM3I,journalArticle,1978,"Saltarelli, Mario","Saltarelli, Mario",,,,,,...,<p>Accession Number: 1978305064. Publication T...,book article,,,"Suner, Margarita",LSRL07,Cornell,Linguistic Symposium on Romance Languages,"[sentential, clitics, and, clause, reduction, ...","[saltarelli, ,, mario]"
128,L6W9Q2RJ,journalArticle,1978,"Otheguy, Ricardo","Otheguy, Ricardo",,,,,,...,<p>Accession Number: 1978304911. Publication T...,book article,,,"Suner, Margarita",LSRL07,Cornell,Linguistic Symposium on Romance Languages,"[a, semantic, analysis, of, the, difference, b...","[otheguy, ,, ricardo]"
129,HUP6C33V,journalArticle,1978,"Lantolf, James P.","Lantolf, James P.",,,,,,...,<p>Accession Number: 1978304909. Publication T...,book article,,,"Suner, Margarita",LSRL07,Cornell,Linguistic Symposium on Romance Languages,"[the, variable, constraints, on, mood, in, pue...","[lantolf, ,, james, p, .]"
130,ZVP84UKF,journalArticle,1978,"Guitart, Jorge M.","Guitart, Jorge M.",,,,,,...,<p>Accession Number: 1978304904. Publication T...,book article,,,"Suner, Margarita",LSRL07,Cornell,Linguistic Symposium on Romance Languages,"[aspects, of, spanish, aspect, :, a, new, look...","[guitart, ,, jorge, m, .]"
131,TTASLDHW,journalArticle,1978,"Davis, Carroll N.","Davis, Carroll N.",,,,,,...,<p>Accession Number: 1978304903. Publication T...,book article,,,"Suner, Margarita",LSRL07,Cornell,Linguistic Symposium on Romance Languages,"[generative, semantic, analysis, of, tense, in...","[davis, ,, carroll, n, .]"
132,NJ9V75ZQ,journalArticle,1978,"Whitley, Stanley","Whitley, Stanley",,,,,,...,<p>Accession Number: 1978304900. Publication T...,book article,,,"Suner, Margarita",LSRL07,Cornell,Linguistic Symposium on Romance Languages,"[rule, reordering, in, the, phonological, hist...","[whitley, ,, stanley]"
133,V7IJIE6G,journalArticle,1978,"Cressey, William W.","Cressey, William W.",,,,,,...,<p>Accession Number: 1978304881. Publication T...,book article,,,"Suner, Margarita",LSRL07,Cornell,Linguistic Symposium on Romance Languages,"[absolute, neutralization, of, the, phonemic, ...","[cressey, ,, william, w, .]"
134,PQHJFE3V,journalArticle,1978,"Velleman, Barry L.","Velleman, Barry L.",,,,,,...,<p>Accession Number: 1978304761. Publication T...,book article,,,"Suner, Margarita",LSRL07,Cornell,Linguistic Symposium on Romance Languages,"[latinist, and, universal, models, in, spanish...","[velleman, ,, barry, l, .]"
135,5YX92DSK,journalArticle,1978,"Redenbarger, Wayne J.","Redenbarger, Wayne J.",,,,,,...,<p>Accession Number: 1978304709. Publication T...,book article,,,"Suner, Margarita",LSRL07,Cornell,Linguistic Symposium on Romance Languages,"[portuguese, vowel, harmony, and, the, 'elsewh...","[redenbarger, ,, wayne, j, .]"
