# Imports

In [110]:
import numpy as np
import pandas as pd
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer

In [139]:
# Optional row cap for quick experiments. Leave as None for the full corpus:
# later cells look up row label 305 and fit a CountVectorizer with min_df=30,
# neither of which can work on a 10-row sample.
N_ROWS = None

# The file is decoded as gzip here even though its name ends in .zip.
df = pd.read_csv('sc_cases.zip', compression='gzip')
if N_ROWS is not None:
    df = df.head(N_ROWS)

# Data Cleaning

In [271]:
# Peek at one raw opinion (row label 305) before any cleaning.
print(df.loc[305, 'opinion_text'])

Justice Souter delivered the opinion of the Court. 

  The question raised is whether conduct made a felony under state law but a misdemeanor under the Controlled Substances Act is a "felony punishable under the Controlled Substances Act."  18 U.S.C. § 924(c)(2).  We hold it is not. 

I 

A 

The Immigration and Nationality Act (INA) defines the term "aggravated felony" by a list that mentions "illicit trafficking in a controlled substance . . . including a drug trafficking crime (as defined in section 924(c) of title 18)."  § 101(a)(43)(B), as added by § 7342, 102 Stat. 4469, and as amended by § 222(a), 108 Stat. 4320, 8 U.S.C. § 1101(a)(43)(B).  The general phrase "illicit trafficking" is left undefined, but § 924(c)(2) of Title 18 identifies the subcategory by defining "drug trafficking crime" as "any felony punishable under the Controlled Substances Act" or under either of two other federal statutes having no bearing on this case.  Following the listing,§ 101(a)(43) of the INA prov

In [266]:
import re

# Take an explicit copy of the one-row sample: assigning into the bare
# df.head(1) result triggers pandas' SettingWithCopyWarning and leaves it
# ambiguous whether df itself gets modified.
dff = df.head(1).copy()

# Remove every digit character from the opinion text.
dff['opinion_text'] = dff['opinion_text'].apply(lambda row: re.sub(r'\d', '', row))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [267]:
# Show the digit-stripped text of the single sampled opinion (row label 0).
print(dff.loc[0, 'opinion_text'])

JUSTICE GINSBURG delivered the opinion of the Court.

 A motion by a federal prisoner for postconviction relief under  U.S.C. §  is subject to a one-year time limitation that generally runs from "the date on which the judgment of conviction becomes final." § , P(). This case concerns the starting date for the one-year limitation. It presents a narrow but recurring question on which courts of appeals have divided: When a defendant in a federal prosecution takes an unsuccessful direct appeal from a judgment of conviction, but does not next petition for a writ of certiorari from this Court, does the judgment become "final" for postconviction relief purposes () when the appellate court issues its mandate affirming the conviction, or, instead, () on the date, ordinarily  days later, when the time for filing a petition for certiorari expires?

In accord with this Court's consistent understanding of finality in the context of collateral review, and the weight of lower court authority, we reje

# Feature Engineering

## Bag of words

In [213]:
# Count 1- to 3-grams without lower-casing, keeping only n-grams that appear
# in at least 30 opinions.
vectorizer1 = CountVectorizer(
    ngram_range=(1, 3),
    min_df=30,
    lowercase=False,
)
X = vectorizer1.fit_transform(df['opinion_text'])

In [214]:
# Read the shape straight from the sparse matrix; converting to a dense array
# first allocates the whole documents-by-features grid just to print two ints.
print(X.shape)

(820, 20212)


In [215]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it. Wrapping in
# list() keeps the printed format identical to the old method's output.
if hasattr(vectorizer1, 'get_feature_names_out'):
    print(list(vectorizer1.get_feature_names_out()))
else:
    print(vectorizer1.get_feature_names())



In [228]:
from sklearn.feature_selection import mutual_info_classif

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer1, 'get_feature_names_out'):
    feature_names = list(vectorizer1.get_feature_names_out())
else:
    feature_names = vectorizer1.get_feature_names()

# Mutual information between each n-gram count column and the authorship label.
mi_scores = mutual_info_classif(X, df['authorship'], discrete_features=True)
res = dict(zip(feature_names, mi_scores))

# Keep the 3000 n-grams with the highest mutual-information score.
best3000 = sorted(res.items(), key=lambda x: x[1], reverse=True)[:3000]
choosenWords = [word for word, _score in best3000]

choosenWords

['the',
 'of',
 'to',
 'that',
 'in',
 'and',
 'at',
 'is',
 'for',
 'of the',
 'not',
 'Ed',
 '2d',
 'Ed 2d',
 'or',
 'Ct',
 'The',
 'as',
 'by',
 'on',
 'be',
 'was',
 'it',
 'an',
 'with',
 'that the',
 'to the',
 'Court',
 'in the',
 'court',
 'have',
 'this',
 'from',
 'See',
 'States',
 'are',
 'its',
 'under',
 'we',
 'would',
 'any',
 'his',
 'law',
 'state',
 'which',
 'United',
 'Act',
 'has',
 'United States',
 'State',
 'Id',
 'had',
 'by the',
 'case',
 'Congress',
 'Id at',
 'on the',
 'no',
 'In',
 'federal',
 'their',
 'may',
 'see',
 'supra',
 'but',
 'and the',
 'he',
 'District',
 'statute',
 'other',
 'such',
 'the Court',
 'claim',
 'Government',
 'if',
 'evidence',
 'supra at',
 'were',
 'also',
 'Court of',
 '3d',
 'And',
 'for the',
 'But',
 'whether',
 'action',
 'when',
 'Appeals',
 'one',
 'We',
 'Court of Appeals',
 'of Appeals',
 'only',
 'Amendment',
 '2d at',
 'to be',
 'because',
 'with the',
 'than',
 'been',
 'the Government',
 'our',
 'does',
 'does n

## Function word
-> https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/

Articles = DT,PDT, WDT<br />
Pronouns = PRP, PRP\$, WP, WP\$<br />
Conjunctions = CC<br />
Particles = RP<br />
Adpositions (Preposition) = IN<br />
Interjections = UH

Maybe = EX, MD (modal, e.g. could, will), TO

Nops = CD, FW, JJx, NNx, (POS), RBx, VBx, WRB

In [190]:
# Re-read the case table from disk for the function-word features.
df = pd.read_csv(
    'sc_cases.zip',
    compression='gzip',
)

In [191]:
from nltk import word_tokenize
from nltk.tag import perceptron

tagger = perceptron.PerceptronTagger()

# POS tags counted as function words: articles, pronouns, conjunctions,
# particles, adpositions and interjections, plus the "maybe" group EX / MD / TO.
tags = [
    'DT', 'PDT', 'WDT',
    'PRP', 'PRP$', 'WP', 'WP$',
    'CC', 'RP', 'IN', 'UH',
    'EX', 'MD', 'TO',
]


def _tag_opinion(text):
    """Tokenize one opinion and return the tagger's (token, tag) pairs."""
    return tagger.tag(word_tokenize(text))


df['tagged'] = df['opinion_text'].apply(_tag_opinion)

In [192]:
def funcword(row):
    """Return the lower-cased tokens of `row` whose POS tag is listed in `tags`.

    `row` is an iterable of (word, tag) pairs; `tags` is the module-level
    list of tags to keep. Input order and duplicates are preserved.
    """
    return [token.lower() for token, pos in row if pos in tags]
            
# Keep only the function words of each tagged opinion, then flatten them into
# one space-separated string per opinion.
df['funcWords'] = df['tagged'].apply(funcword).apply(" ".join)

In [196]:
# Unigram counts over the function-word strings, dropping words that appear in
# fewer than two opinions. funcword() has already lower-cased the tokens.
vectorizer2 = CountVectorizer(
    ngram_range=(1, 1),
    min_df=2,
    lowercase=False,
)
X2 = vectorizer2.fit_transform(df['funcWords'])

In [197]:
# Read the shape straight from the sparse matrix rather than building a dense
# copy only to inspect its dimensions.
print(X2.shape)

(820, 358)


In [198]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it. Wrapping in
# list() keeps the printed format identical to the old method's output.
if hasattr(vectorizer2, 'get_feature_names_out'):
    print(list(vectorizer2.get_feature_names_out()))
else:
    print(vectorizer2.get_feature_names())

['14because', '1although', '1in', '29', '2although', '2because', '3although', '479', '4although', '5although', '5to', '6although', '9although', 'aboard', 'about', 'above', 'absent', 'abuse', 'accept', 'accord', 'accords', 'acknowledge', 'across', 'address', 'advance', 'affirm', 'affords', 'after', 'again', 'against', 'ago', 'albeit', 'all', 'allege', 'allow', 'almost', 'along', 'alongside', 'alter', 'although', 'amid', 'among', 'an', 'and', 'another', 'answer', 'any', 'anytime', 'apart', 'appeal', 'appendix', 'applied', 'arbaugh', 'argue', 'arguendo', 'around', 'as', 'aside', 'assert', 'assess', 'assigns', 'at', 'attack', 'authorizes', 'automobile', 'avoids', 'award', 'awards', 'awareness', 'away', 'back', 'background', 'bear', 'because', 'before', 'behavior', 'behind', 'belong', 'below', 'beneath', 'beside', 'besides', 'best', 'between', 'beyond', 'bind', 'blurs', 'bore', 'both', 'bound', 'broadcast', 'brown', 'burden', 'but', 'by', 'ca', 'can', 'cause', 'congress', 'could', 'de', 'de

In [199]:
# Densify the sparse function-word count matrix for a quick look at the counts.
function_word_counts = X2.toarray()
print(function_word_counts)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 2 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
